diff --git a/common/chat.cpp b/common/chat.cpp index 70b9f5dc2c58..38f7a2ed744a 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1661,6 +1661,7 @@ static common_chat_params common_chat_params_init_gigachat_v3( static common_chat_params common_chat_params_init_deepseek_v3_2(const common_chat_template & tmpl, const autoparser::generation_params & inputs) { common_chat_params data; + const auto & src = tmpl.source(); data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; @@ -1681,8 +1682,9 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha const std::string DSML = "|DSML|"; const std::string THINK_START = ""; const std::string THINK_END = ""; - const std::string FC_START = "<" + DSML + "function_calls>"; - const std::string FC_END = ""; + const std::string FC_NAME = src.find("function_calls") != std::string::npos ? "function_calls" : "tool_calls"; + const std::string FC_START = "<" + DSML + FC_NAME + ">"; + const std::string FC_END = ""; const std::string INVOKE_START = "<" + DSML + "invoke"; const std::string INVOKE_END = ""; const std::string PARAM_START = "<" + DSML + "parameter"; @@ -2093,12 +2095,12 @@ std::optional common_chat_try_specialized_template( return common_chat_params_init_gigachat_v3(tmpl, params); } - // DeepSeek V3.2 format detection: template defines dsml_token and uses it for tool calls. + // DeepSeek DSML format detection: template defines dsml_token and uses it for tool calls. // The template source contains the token as a variable assignment, not as a literal in markup. 
if (src.find("dsml_token") != std::string::npos && - src.find("function_calls") != std::string::npos && + (src.find("function_calls") != std::string::npos || src.find("tool_calls") != std::string::npos) && src.find("DSML") != std::string::npos) { - LOG_DBG("Using specialized template: DeepSeek V3.2\n"); + LOG_DBG("Using specialized template: DeepSeek DSML\n"); return common_chat_params_init_deepseek_v3_2(tmpl, params); } diff --git a/conversion/__init__.py b/conversion/__init__.py index 2c38123dff8d..bba37a5cbbc7 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -47,6 +47,7 @@ "DeepseekForCausalLM": "deepseek", "DeepseekV2ForCausalLM": "deepseek", "DeepseekV3ForCausalLM": "deepseek", + "DeepseekV4ForCausalLM": "deepseek", "DistilBertForMaskedLM": "bert", "DistilBertForSequenceClassification": "bert", "DistilBertModel": "bert", diff --git a/conversion/deepseek.py b/conversion/deepseek.py index e149fcbf752e..86a3046b9e98 100644 --- a/conversion/deepseek.py +++ b/conversion/deepseek.py @@ -1,18 +1,26 @@ from __future__ import annotations +import concurrent.futures +import ctypes +import math +import os import re -from typing import Any, Callable, Iterable, TYPE_CHECKING +from pathlib import Path +from typing import Any, Callable, Iterable, Sequence, TYPE_CHECKING +import numpy as np import torch if TYPE_CHECKING: from torch import Tensor -from .base import MmprojModel, ModelBase, TextModel, gguf, logger +from .base import LazyTorchTensor, MmprojModel, ModelBase, TextModel, gguf, logger from .qwen import QwenModel +TORCH_FLOAT8_E8M0FNU = getattr(torch, "float8_e8m0fnu", None) + @ModelBase.register("DeepseekOCRForCausalLM") class DeepseekOCRVisionModel(MmprojModel): @@ -386,3 +394,648 @@ def prepare_tensors(self): experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("DeepseekV4ForCausalLM") +class DeepseekV4Model(TextModel): + model_arch = 
gguf.MODEL_ARCH.DEEPSEEK4 + + # Optional DeepSeek V4 debug / expert-quant knobs. In the pre-#17114 + # monolithic convert_hf_to_gguf.py these were ModelBase.__init__ params + # wired to --deepseek4-* CLI flags. The refactored conversion/base.py + # ModelBase.__init__ does not accept them, so they default here; the + # standard DeepseekV4ForCausalLM conversion path does not require them. + deepseek4_max_layers: int | None = None + deepseek4_expert_outtypes: str | None = None + deepseek4_expert_workers: int = 1 + + _experts: list[dict[str, Tensor]] | None = None + + _fp4_table = torch.tensor([ + 0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, + 0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0, + ], dtype=torch.float32) + + _qtype_aliases: dict[str, gguf.GGMLQuantizationType] = { + "q8_0": gguf.GGMLQuantizationType.Q8_0, + "q2_k": gguf.GGMLQuantizationType.Q2_K, + "iq2_xxs": gguf.GGMLQuantizationType.IQ2_XXS, + "iq2_xs": gguf.GGMLQuantizationType.IQ2_XS, + "tq1_0": gguf.GGMLQuantizationType.TQ1_0, + "tq2_0": gguf.GGMLQuantizationType.TQ2_0, + } + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._deepseek4_original_block_count = self.block_count + if self.deepseek4_max_layers is not None: + if self.deepseek4_max_layers <= 0: + raise ValueError("--deepseek4-max-layers must be positive") + if self.deepseek4_max_layers > self.block_count: + raise ValueError( + f"--deepseek4-max-layers={self.deepseek4_max_layers} exceeds model layer count {self.block_count}" + ) + self.block_count = self.deepseek4_max_layers + self.hparams["num_hidden_layers"] = self.block_count + self.hparams["n_layers"] = self.block_count + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + logger.warning( + "DeepSeek V4 debug export: writing only the first %d/%d transformer layers", + self.block_count, + self._deepseek4_original_block_count, + ) + + self._deepseek4_expert_qtypes = self._parse_expert_outtype_spec(self.deepseek4_expert_outtypes) + + def 
set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + self.hparams["num_key_value_heads"] = self.hparams.get("num_key_value_heads", 1) + + super().set_gguf_parameters() + hparams = self.hparams + + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_attention_output_lora_rank(hparams["o_lora_rank"]) + self.gguf_writer.add_attention_output_group_count(hparams["o_groups"]) + self.gguf_writer.add_attention_compress_ratios(hparams["compress_ratios"]) + self.gguf_writer.add_attention_compress_rope_freq_base(hparams["compress_rope_theta"]) + + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + self.gguf_writer.add_expert_weights_scale(hparams.get("routed_scaling_factor", 1.0)) + self.gguf_writer.add_hash_layer_count(min(hparams["num_hash_layers"], self.block_count)) + if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None: + self.gguf_writer.add_expert_weights_norm(norm_topk_prob) + if (swiglu_limit := hparams.get("swiglu_limit")) is not None and float(swiglu_limit) > 0.0: + self.gguf_writer.add_swiglu_clamp_exp([float(swiglu_limit)] * self.block_count) + + if (sliding_window := hparams.get("sliding_window")) is not None: + self.gguf_writer.add_sliding_window(sliding_window) + + self.gguf_writer.add_indexer_head_count(hparams["index_n_heads"]) + self.gguf_writer.add_indexer_key_length(hparams["index_head_dim"]) + self.gguf_writer.add_indexer_top_k(hparams["index_topk"]) + + if self.deepseek4_max_layers is None and (num_nextn_predict_layers := hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) + + 
self.gguf_writer.add_hyper_connection_count(hparams["hc_mult"]) + self.gguf_writer.add_hyper_connection_sinkhorn_iters(hparams["hc_sinkhorn_iters"]) + self.gguf_writer.add_hyper_connection_eps(hparams["hc_eps"]) + + @staticmethod + def _strip_model_prefix(name: str) -> str: + return name.removeprefix("model.") + + def _skip_layer_tensor(self, stripped_name: str) -> bool: + if self.deepseek4_max_layers is None: + return False + match = re.match(r"layers\.(\d+)\.", stripped_name) + return match is not None and int(match.group(1)) >= self.block_count + + @staticmethod + def _is_low_bit_ftype(ftype: gguf.LlamaFileType) -> bool: + return ftype in ( + gguf.LlamaFileType.MOSTLY_TQ1_0, + gguf.LlamaFileType.MOSTLY_TQ2_0, + gguf.LlamaFileType.MOSTLY_Q2_K, + gguf.LlamaFileType.MOSTLY_IQ2_XXS, + gguf.LlamaFileType.MOSTLY_IQ2_XS, + ) + + @staticmethod + def _qtype_for_ftype(ftype: gguf.LlamaFileType) -> gguf.GGMLQuantizationType | None: + return { + gguf.LlamaFileType.MOSTLY_TQ1_0: gguf.GGMLQuantizationType.TQ1_0, + gguf.LlamaFileType.MOSTLY_TQ2_0: gguf.GGMLQuantizationType.TQ2_0, + gguf.LlamaFileType.MOSTLY_Q2_K: gguf.GGMLQuantizationType.Q2_K, + gguf.LlamaFileType.MOSTLY_IQ2_XXS: gguf.GGMLQuantizationType.IQ2_XXS, + gguf.LlamaFileType.MOSTLY_IQ2_XS: gguf.GGMLQuantizationType.IQ2_XS, + gguf.LlamaFileType.MOSTLY_Q8_0: gguf.GGMLQuantizationType.Q8_0, + }.get(ftype) + + @classmethod + def _parse_qtype_name(cls, name: str) -> gguf.GGMLQuantizationType: + qtype = cls._qtype_aliases.get(name.strip().lower()) + if qtype is None: + allowed = ", ".join(sorted(cls._qtype_aliases)) + raise ValueError(f"unknown DeepSeek V4 expert outtype {name!r}; expected one of: {allowed}") + return qtype + + @classmethod + def _parse_expert_outtype_spec(cls, spec: str | None) -> dict[str, gguf.GGMLQuantizationType]: + if spec is None: + return {} + + result: dict[str, gguf.GGMLQuantizationType] = {} + for item in spec.split(","): + item = item.strip() + if not item: + continue + if "=" not in item: + 
qtype = cls._parse_qtype_name(item) + result.update({"w1": qtype, "w2": qtype, "w3": qtype}) + continue + key, value = (part.strip().lower() for part in item.split("=", 1)) + if key not in ("w1", "w2", "w3", "gate", "down", "up"): + raise ValueError(f"unknown DeepSeek V4 expert tensor selector {key!r}") + wid = {"gate": "w1", "down": "w2", "up": "w3"}.get(key, key) + result[wid] = cls._parse_qtype_name(value) + return result + + @staticmethod + def _scale_to_float(scale: Tensor) -> Tensor: + if TORCH_FLOAT8_E8M0FNU is not None and scale.dtype == TORCH_FLOAT8_E8M0FNU: + return scale.float() + + if scale.dtype in (torch.uint8, torch.int8): + e = scale.view(torch.uint8).to(torch.int32) + bits = torch.where( + e == 0, + torch.full_like(e, 0x00400000), + e << 23, + ) + return bits.view(torch.float32) + + return scale.float() + + @staticmethod + def _scale_to_e8m0_bytes(scale: Tensor) -> Tensor: + if TORCH_FLOAT8_E8M0FNU is not None and scale.dtype == TORCH_FLOAT8_E8M0FNU: + return scale.view(torch.uint8) + if scale.dtype in (torch.uint8, torch.int8): + return scale.view(torch.uint8) + + scale = scale.float() + e = torch.where( + scale > 0, + torch.floor(torch.log2(scale)).to(torch.int32) + 127, + torch.zeros_like(scale, dtype=torch.int32), + ) + return torch.clamp(e, 0, 255).to(torch.uint8) + + @classmethod + def _dequant_fp8_weight(cls, weight: Tensor, scale: Tensor, block_size: Sequence[int]) -> Tensor: + if len(block_size) != 2: + raise ValueError(f"DeepSeek V4 expects 2D FP8 block scales, got block size {block_size}") + + block_out, block_in = block_size + out_dim, in_dim = weight.shape + if out_dim % block_out != 0 or in_dim % block_in != 0: + raise ValueError(f"FP8 tensor shape {tuple(weight.shape)} is not divisible by block size {block_size}") + + scale = cls._scale_to_float(scale) + expected_scale = (out_dim // block_out, in_dim // block_in) + if tuple(scale.shape) != expected_scale: + raise ValueError(f"FP8 scale shape {tuple(scale.shape)} does not match 
expected {expected_scale}") + + weight = weight.reshape(out_dim // block_out, block_out, in_dim // block_in, block_in) + weight = weight.float() * scale[:, None, :, None] + return weight.reshape(out_dim, in_dim) + + @classmethod + def _dequant_fp4_weight(cls, weight: Tensor, scale: Tensor) -> Tensor: + weight = weight.view(torch.uint8) + out_dim, packed_in_dim = weight.shape + in_dim = packed_in_dim * 2 + if in_dim % 32 != 0: + raise ValueError(f"FP4 packed tensor shape {tuple(weight.shape)} does not contain 32-value blocks") + + n_blocks = in_dim // 32 + scale = cls._scale_to_float(scale) + if tuple(scale.shape) != (out_dim, n_blocks): + raise ValueError(f"FP4 scale shape {tuple(scale.shape)} does not match expected {(out_dim, n_blocks)}") + + fp4_table = cls._fp4_table.to(weight.device) + packed = weight.reshape(out_dim, n_blocks, 16) + low = packed & 0x0F + high = (packed >> 4) & 0x0F + vals = torch.stack((low, high), dim=-1).reshape(out_dim, n_blocks, 32) + vals = fp4_table[vals.long()] * scale.unsqueeze(-1) + return vals.reshape(out_dim, in_dim) + + @classmethod + def _pack_fp4_as_mxfp4(cls, weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]: + weight = weight.view(torch.uint8) + out_dim, packed_in_dim = weight.shape + in_dim = packed_in_dim * 2 + if in_dim % 32 != 0: + raise ValueError(f"FP4 packed tensor shape {tuple(weight.shape)} does not contain 32-value blocks") + + n_blocks = in_dim // 32 + scale_e = cls._scale_to_e8m0_bytes(scale) + if tuple(scale_e.shape) != (out_dim, n_blocks): + raise ValueError(f"FP4 scale shape {tuple(scale_e.shape)} does not match expected {(out_dim, n_blocks)}") + + packed = weight.reshape(out_dim, n_blocks, 16) + low = packed & 0x0F + high = (packed >> 4) & 0x0F + vals = torch.stack((low, high), dim=-1).reshape(out_dim, n_blocks, 32) + qs = vals[:, :, :16] | (vals[:, :, 16:] << 4) + raw = torch.cat((scale_e.unsqueeze(-1), qs), dim=-1).reshape(out_dim, n_blocks * 17) + return raw.numpy(), [out_dim, in_dim] + + 
_ggml_quant_lib: Any = None + + @classmethod + def _load_ggml_quant_lib(cls): + if cls._ggml_quant_lib is not None: + return cls._ggml_quant_lib + + # This module lives in the conversion/ package; the repo root (where + # build/bin/libggml.* lands) is its parent's parent. In the pre-#17114 + # monolithic convert_hf_to_gguf.py, __file__ was the repo-root script, + # so .parent alone was the repo root -- search both so the lookup is + # correct regardless of package layout. + repo_root = Path(__file__).resolve().parent.parent + pkg_root = Path(__file__).resolve().parent + candidates = [ + os.environ.get("LLAMA_CPP_LIBGGML"), + repo_root / "build" / "bin" / "libggml.dylib", + repo_root / "build" / "bin" / "libggml.so", + repo_root / "build" / "bin" / "ggml.dll", + pkg_root / "build" / "bin" / "libggml.dylib", + pkg_root / "build" / "bin" / "libggml.so", + pkg_root / "build" / "bin" / "ggml.dll", + ] + for candidate in candidates: + if candidate is None: + continue + path = Path(candidate) + if not path.is_file(): + continue + lib = ctypes.CDLL(str(path)) + lib.ggml_quantize_chunk.restype = ctypes.c_size_t + lib.ggml_quantize_chunk.argtypes = ( + ctypes.c_int, + ctypes.POINTER(ctypes.c_float), + ctypes.c_void_p, + ctypes.c_int64, + ctypes.c_int64, + ctypes.c_int64, + ctypes.POINTER(ctypes.c_float), + ) + lib.ggml_quantize_requires_imatrix.restype = ctypes.c_bool + lib.ggml_quantize_requires_imatrix.argtypes = (ctypes.c_int,) + cls._ggml_quant_lib = lib + return lib + + raise RuntimeError( + "DeepSeek V4 low-bit expert conversion needs llama.cpp's libggml. " + "Build llama.cpp first or set LLAMA_CPP_LIBGGML to libggml." 
+ ) + + @classmethod + def _quantize_deepseek4_expert(cls, data: np.ndarray, qtype: gguf.GGMLQuantizationType) -> np.ndarray: + c_quantized_types = { + gguf.GGMLQuantizationType.Q2_K, + gguf.GGMLQuantizationType.IQ2_XXS, + gguf.GGMLQuantizationType.IQ2_XS, + } + if qtype not in c_quantized_types: + return gguf.quants.quantize(data, qtype) + + data = np.ascontiguousarray(data, dtype=np.float32) + out = np.zeros(gguf.quant_shape_to_byte_shape(data.shape, qtype), dtype=np.uint8, order="C") + lib = cls._load_ggml_quant_lib() + nrows = math.prod(data.shape[:-1]) + n_per_row = data.shape[-1] + imatrix = ctypes.cast(0, ctypes.POINTER(ctypes.c_float)) + if lib.ggml_quantize_requires_imatrix(qtype.value): + qw = np.ascontiguousarray(np.sum(data.reshape(-1, n_per_row) ** 2, axis=0), dtype=np.float32) + imatrix = qw.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) + result_size = lib.ggml_quantize_chunk( + qtype.value, + data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), + out.ctypes.data_as(ctypes.c_void_p), + 0, + nrows, + n_per_row, + imatrix, + ) + if result_size != out.size: + raise RuntimeError(f"ggml_quantize_chunk wrote {result_size} bytes, expected {out.size}") + return out + + def _write_deepseek4_tid2eid_tensors(self) -> set[str]: + consumed: set[str] = set() + for name in list(self.model_tensors.keys()): + stripped = self._strip_model_prefix(name) + if self._skip_layer_tensor(stripped): + consumed.add(name) + continue + if re.match(r"layers\.\d+\.ffn\.gate\.tid2eid$", stripped) is None: + continue + + data = LazyTorchTensor.to_eager(self.model_tensors[name]()).to(torch.int32).numpy() + new_name = self.map_tensor_name(stripped) + logger.info(f"{new_name}, int32 --> I32, shape = {{{', '.join(str(n) for n in reversed(data.shape))}}}") + self.gguf_writer.add_tensor(new_name, data) + consumed.add(name) + return consumed + + def _write_deepseek4_expert_tensors(self) -> set[str]: + default_qtype = self._qtype_for_ftype(self.ftype) + if default_qtype is None and not 
self._deepseek4_expert_qtypes: + if any(re.match(r"(?:model\.)?layers\.\d+\.ffn\.experts\.\d+\.w[123]\.weight$", name) for name in self.model_tensors): + raise NotImplementedError( + "DeepSeek V4 routed FP4 experts must be converted directly to a compact GGUF type. " + "Use --outtype iq2_xxs, iq2_xs, q2_k, tq2_0, tq1_0, or q8_0." + ) + return set() + + n_experts = self.hparams["n_routed_experts"] + consumed: set[str] = set() + groups: dict[tuple[int, str], dict[int, tuple[str, str]]] = {} + + for name in list(self.model_tensors.keys()): + stripped = self._strip_model_prefix(name) + if self._skip_layer_tensor(stripped): + consumed.add(name) + continue + match = re.match(r"layers\.(\d+)\.ffn\.experts\.(\d+)\.(w[123])\.weight$", stripped) + if match is None: + continue + + bid = int(match.group(1)) + xid = int(match.group(2)) + wid = match.group(3) + qtype = self._deepseek4_expert_qtypes.get(wid, default_qtype) + if qtype is None: + raise RuntimeError(f"No DeepSeek V4 expert quantization type selected for {wid}") + scale_name = f"{stripped.removesuffix('.weight')}.scale" + model_scale_name = scale_name if scale_name in self.model_tensors else f"model.{scale_name}" + if model_scale_name not in self.model_tensors: + raise ValueError(f"Missing DeepSeek V4 FP4 scale tensor for {stripped}") + + groups.setdefault((bid, wid), {})[xid] = (name, model_scale_name) + consumed.update((name, model_scale_name)) + + def convert_one(name: str, model_scale_name: str, qtype: gguf.GGMLQuantizationType) -> np.ndarray: + weight = LazyTorchTensor.to_eager(self.model_tensors[name]()) + scale = LazyTorchTensor.to_eager(self.model_tensors[model_scale_name]()) + + if qtype == gguf.GGMLQuantizationType.MXFP4: + data, _ = self._pack_fp4_as_mxfp4(weight, scale) + return data + + data = self._dequant_fp4_weight(weight, scale).numpy() + return self._quantize_deepseek4_expert(data, qtype) + + def add_merged_tensor(bid: int, wid: str, qtype: gguf.GGMLQuantizationType, experts: dict[int, np.ndarray]) 
-> None: + missing = sorted(set(range(n_experts)).difference(experts)) + if missing: + raise ValueError(f"Missing DeepSeek V4 expert tensors for layer {bid} {wid}: {missing[:8]}") + + merged = np.stack([experts[i] for i in range(n_experts)], axis=0) + merged_name = f"layers.{bid}.ffn.experts.{wid}.weight" + new_name = self.map_tensor_name(merged_name) + shape = gguf.quant_shape_from_byte_shape(merged.shape, qtype) if merged.dtype == np.uint8 else merged.shape + shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" + logger.info(f"{new_name}, DeepSeek FP4 --> {qtype.name}, shape = {shape_str}") + self.gguf_writer.add_tensor(new_name, merged, raw_dtype=qtype) + + worker_count = max(1, self.deepseek4_expert_workers) + for bid, wid in sorted(groups): + qtype = self._deepseek4_expert_qtypes.get(wid, default_qtype) + if qtype is None: + raise RuntimeError(f"No DeepSeek V4 expert quantization type selected for {wid}") + group = groups[(bid, wid)] + experts: dict[int, np.ndarray] = {} + logger.info( + "DeepSeek V4: quantizing blk.%d %s experts to %s with %d worker%s", + bid, + wid, + qtype.name, + worker_count, + "" if worker_count == 1 else "s", + ) + + if worker_count == 1: + for done, xid in enumerate(sorted(group), start=1): + name, model_scale_name = group[xid] + experts[xid] = convert_one(name, model_scale_name, qtype) + if done % 32 == 0 or done == n_experts: + logger.info("DeepSeek V4: blk.%d %s %d/%d experts", bid, wid, done, n_experts) + else: + max_pending = worker_count * 2 + pending: dict[concurrent.futures.Future[np.ndarray], int] = {} + xids = iter(sorted(group)) + done = 0 + + with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor: + def submit_next() -> bool: + try: + xid = next(xids) + except StopIteration: + return False + name, model_scale_name = group[xid] + future = executor.submit(convert_one, name, model_scale_name, qtype) + pending[future] = xid + return True + + while len(pending) < max_pending and 
submit_next(): + pass + + while pending: + finished, _ = concurrent.futures.wait( + pending, + return_when=concurrent.futures.FIRST_COMPLETED, + ) + for future in finished: + xid = pending.pop(future) + experts[xid] = future.result() + done += 1 + if done % 32 == 0 or done == n_experts: + logger.info("DeepSeek V4: blk.%d %s %d/%d experts", bid, wid, done, n_experts) + submit_next() + + add_merged_tensor(bid, wid, qtype, experts) + + return consumed + + def _prepare_deepseek4_scaled_tensors(self) -> None: + block_size = (self.hparams.get("quantization_config") or {}).get("weight_block_size", [128, 128]) + consumed: set[str] = set() + + for name in list(self.model_tensors.keys()): + stripped = self._strip_model_prefix(name) + if stripped.startswith("mtp.") or self._skip_layer_tensor(stripped): + consumed.add(name) + + consumed.update(self._write_deepseek4_tid2eid_tensors()) + consumed.update(self._write_deepseek4_expert_tensors()) + + for name in list(self.model_tensors.keys()): + if name in consumed: + continue + stripped = self._strip_model_prefix(name) + if not stripped.endswith(".scale"): + continue + if re.match(r"layers\.\d+\.ffn\.experts\.\d+\.w[123]\.scale$", stripped) is not None: + continue + + weight_name = f"{stripped.removesuffix('.scale')}.weight" + model_weight_name = weight_name if weight_name in self.model_tensors else f"model.{weight_name}" + if model_weight_name not in self.model_tensors: + raise ValueError(f"Missing DeepSeek V4 FP8 weight tensor for scale {stripped}") + + w = self.model_tensors[model_weight_name] + s = self.model_tensors[name] + self.model_tensors[model_weight_name] = ( + lambda w=w, s=s, bs=block_size: self._dequant_fp8_weight( + LazyTorchTensor.to_eager(w()), + LazyTorchTensor.to_eager(s()), + bs, + ) + ) + consumed.add(name) + + for name in consumed: + self.model_tensors.pop(name, None) + + def prepare_tensors(self): + self._prepare_deepseek4_scaled_tensors() + + if any(name.endswith(".scale") for name in self.model_tensors): + 
raise NotImplementedError("Unhandled DeepSeek V4 scale tensors remain after conversion preparation") + + super().prepare_tensors() + + if self._experts is not None: + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + del name + del new_name + del bid + + if not self._is_low_bit_ftype(self.ftype) or n_dims <= 1: + return False + + # DeepSeek V4 routed experts are handled in _write_deepseek4_expert_tensors(), + # where each expert is converted directly from FP4 to the requested compact + # GGUF type. Keep the rest of the model in float form so attention, + # hyper-connections, indexers, compressors, shared experts and logits do not + # inherit the global low-bit file type. + return gguf.GGMLQuantizationType.F16 + + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + mapped = self._map_tensor_name_deepseek4(name) + if mapped is not None: + return mapped + return super().map_tensor_name(name, try_suffixes) + + def _map_tensor_name_deepseek4(self, name: str) -> str | None: + if name.startswith("model."): + name = name.removeprefix("model.") + + top_level: dict[str, tuple[gguf.MODEL_TENSOR, str]] = { + "embed.weight": (gguf.MODEL_TENSOR.TOKEN_EMBD, ".weight"), + "norm.weight": (gguf.MODEL_TENSOR.OUTPUT_NORM, ".weight"), + "head.weight": (gguf.MODEL_TENSOR.OUTPUT, ".weight"), + "hc_head_base": (gguf.MODEL_TENSOR.OUTPUT_HC_BASE, ".weight"), + "hc_head_fn": (gguf.MODEL_TENSOR.OUTPUT_HC_FN, ".weight"), + "hc_head_scale": (gguf.MODEL_TENSOR.OUTPUT_HC_SCALE, ".weight"), + } + if name in top_level: + tensor, suffix = top_level[name] + return self.format_tensor_name(tensor, suffix=suffix) + + match = re.match(r"layers\.(\d+)\.(.+)", name) + if match is None: + return None + + bid = int(match.group(1)) + rest = match.group(2) + 
+ layer_level: dict[str, tuple[gguf.MODEL_TENSOR, str]] = { + "hc_attn_base": (gguf.MODEL_TENSOR.HC_ATTN_BASE, ".weight"), + "hc_attn_fn": (gguf.MODEL_TENSOR.HC_ATTN_FN, ".weight"), + "hc_attn_scale": (gguf.MODEL_TENSOR.HC_ATTN_SCALE, ".weight"), + "hc_ffn_base": (gguf.MODEL_TENSOR.HC_FFN_BASE, ".weight"), + "hc_ffn_fn": (gguf.MODEL_TENSOR.HC_FFN_FN, ".weight"), + "hc_ffn_scale": (gguf.MODEL_TENSOR.HC_FFN_SCALE, ".weight"), + "attn.attn_sink": (gguf.MODEL_TENSOR.ATTN_SINKS, ".weight"), + "attn.wq_a.weight": (gguf.MODEL_TENSOR.ATTN_Q_A, ".weight"), + "attn.wq_b.weight": (gguf.MODEL_TENSOR.ATTN_Q_B, ".weight"), + "attn.q_norm.weight": (gguf.MODEL_TENSOR.ATTN_Q_A_NORM, ".weight"), + "attn.wkv.weight": (gguf.MODEL_TENSOR.ATTN_KV, ".weight"), + "attn.kv_norm.weight": (gguf.MODEL_TENSOR.ATTN_KV_A_NORM, ".weight"), + "attn.wo_a.weight": (gguf.MODEL_TENSOR.ATTN_OUT_A, ".weight"), + "attn.wo_b.weight": (gguf.MODEL_TENSOR.ATTN_OUT_B, ".weight"), + "attn.compressor.ape": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_APE, ".weight"), + "attn.compressor.wkv.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_KV, ".weight"), + "attn.compressor.wgate.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_GATE, ".weight"), + "attn.compressor.norm.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_NORM, ".weight"), + "attn.indexer.wq_b.weight": (gguf.MODEL_TENSOR.INDEXER_ATTN_Q_B, ".weight"), + "attn.indexer.weights_proj.weight": (gguf.MODEL_TENSOR.INDEXER_PROJ, ".weight"), + "attn.indexer.compressor.ape": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_APE, ".weight"), + "attn.indexer.compressor.wkv.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_KV, ".weight"), + "attn.indexer.compressor.wgate.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_GATE, ".weight"), + "attn.indexer.compressor.norm.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_NORM, ".weight"), + "attn_norm.weight": (gguf.MODEL_TENSOR.ATTN_NORM, ".weight"), + "ffn_norm.weight": (gguf.MODEL_TENSOR.FFN_NORM, ".weight"), + "ffn.shared_experts.w1.weight": 
(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, ".weight"), + "ffn.shared_experts.w3.weight": (gguf.MODEL_TENSOR.FFN_UP_SHEXP, ".weight"), + "ffn.shared_experts.w2.weight": (gguf.MODEL_TENSOR.FFN_DOWN_SHEXP, ".weight"), + "ffn.gate.weight": (gguf.MODEL_TENSOR.FFN_GATE_INP, ".weight"), + "ffn.gate.bias": (gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, ".bias"), + "ffn.gate.tid2eid": (gguf.MODEL_TENSOR.FFN_GATE_TID2EID, ".weight"), + "ffn.experts.w1.weight": (gguf.MODEL_TENSOR.FFN_GATE_EXP, ".weight"), + "ffn.experts.w3.weight": (gguf.MODEL_TENSOR.FFN_UP_EXP, ".weight"), + "ffn.experts.w2.weight": (gguf.MODEL_TENSOR.FFN_DOWN_EXP, ".weight"), + } + if rest in layer_level: + tensor, suffix = layer_level[rest] + return self.format_tensor_name(tensor, bid, suffix=suffix) + + return None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("model."): + name = name.removeprefix("model.") + + # TODO: llama.cpp does not have Multi-Token Prediction for DeepSeek yet. 
+ if name.startswith("mtp."): + return + + # process the experts separately + match = re.match(r"layers\.(\d+)\.ffn\.experts\.(\d+)\.(w[123])\.weight", name) + if match is not None: + bid = int(match.group(1)) + xid = int(match.group(2)) + wid = match.group(3) + n_experts = self.hparams["n_routed_experts"] + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + for w_name in ["w1", "w3", "w2"]: + datas: list[Tensor] = [] + + for expert_id in range(n_experts): + ename = f"layers.{bid}.ffn.experts.{expert_id}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"layers.{bid}.ffn.experts.{w_name}.weight" + yield self.map_tensor_name(merged_name), data_torch + return + + del xid, wid + return + + yield self.map_tensor_name(name), data_torch diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 41566d41aef3..f71943ed33aa 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -567,6 +567,11 @@ extern "C" { GGML_OP_RWKV_WKV7, GGML_OP_SOLVE_TRI, GGML_OP_GATED_DELTA_NET, + GGML_OP_DSV4_HC_SPLIT_SINKHORN, + GGML_OP_DSV4_HC_WEIGHTED_SUM, + GGML_OP_DSV4_HC_EXPAND, + GGML_OP_DSV4_FP8_KV_QUANTIZE, + GGML_OP_DSV4_ROPE_TAIL, GGML_OP_UNARY, @@ -2555,6 +2560,61 @@ extern "C" { struct ggml_tensor * beta, struct ggml_tensor * state); + // DeepSeek V4 hyperconnection helper. + // Splits [mix, tokens] into pre/post/comb regions and applies the + // Sinkhorn normalization used by the reference implementation. + GGML_API struct ggml_tensor * ggml_dsv4_hc_split_sinkhorn( + struct ggml_context * ctx, + struct ggml_tensor * mixes, + struct ggml_tensor * scale, + struct ggml_tensor * base, + int n_hc, + int sinkhorn_iters, + float eps); + + // DeepSeek V4 hyperconnection weighted-sum helper. + // Computes sum_hc weights[hc, token] * x[embd, hc, token]. 
+ GGML_API struct ggml_tensor * ggml_dsv4_hc_weighted_sum( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * weights); + + // DeepSeek V4 hyperconnection expand helper. + // Computes post * block_out + comb^T @ residual for each token. + GGML_API struct ggml_tensor * ggml_dsv4_hc_expand( + struct ggml_context * ctx, + struct ggml_tensor * block_out, + struct ggml_tensor * residual, + struct ggml_tensor * post, + struct ggml_tensor * comb); + + // DeepSeek V4 FP8 KV-cache simulation helper. + // Quantizes/dequantizes the non-RoPE prefix in E4M3FN blocks and leaves + // the RoPE tail unchanged, matching the reference inference path. + GGML_API struct ggml_tensor * ggml_dsv4_fp8_kv_quantize( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_rot); + + // DeepSeek V4 partial RoPE helper. + // Leaves the non-RoPE prefix unchanged and applies RoPE to the tail, + // matching ggml_concat(prefix, ggml_rope_ext(tail)). + GGML_API struct ggml_tensor * ggml_dsv4_rope_tail( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pos, + struct ggml_tensor * freq_factors, + int n_dims, + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, + bool inverse); + // custom operators typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 4e36909f45e9..d51620c288a6 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -754,7 +754,16 @@ static bool ggml_is_view_op(enum ggml_op op) { #endif #ifndef GGML_SCHED_MAX_SPLIT_INPUTS -#define GGML_SCHED_MAX_SPLIT_INPUTS 30 +// V4 multi-GPU note: V4 (DeepSeek-V4) graphs need a higher value (~80-128) +// when split across multiple devices, due to dense per-layer inputs +// (hyperconnection × 4 + indexer/compressor state + multiple KV 
caches). +// Single-device runs never trip the cap. The constant gates not just +// `sched_split.inputs[N]` (small) but also `nodes_size` and +// `context_buffer_size` allocations that scale as `graph_size × N` — +// bumping the default adds ~200 MB per scheduler instance for V4-sized +// graphs, paid even by single-GPU users who don't need it. We bump it +// anyway for the DSv4-Flash use-case (Strix Halo / unified memory). +#define GGML_SCHED_MAX_SPLIT_INPUTS 128 #endif #ifndef GGML_SCHED_MAX_COPIES diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index cd5c61a81879..70f8def3a742 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2047,6 +2047,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_gated_delta_net(params, tensor); } break; + case GGML_OP_DSV4_HC_SPLIT_SINKHORN: + { + ggml_compute_forward_dsv4_hc_split_sinkhorn(params, tensor); + } break; + case GGML_OP_DSV4_HC_WEIGHTED_SUM: + { + ggml_compute_forward_dsv4_hc_weighted_sum(params, tensor); + } break; + case GGML_OP_DSV4_HC_EXPAND: + { + ggml_compute_forward_dsv4_hc_expand(params, tensor); + } break; + case GGML_OP_DSV4_FP8_KV_QUANTIZE: + { + ggml_compute_forward_dsv4_fp8_kv_quantize(params, tensor); + } break; + case GGML_OP_DSV4_ROPE_TAIL: + { + ggml_compute_forward_dsv4_rope_tail(params, tensor); + } break; case GGML_OP_MAP_CUSTOM1: { ggml_compute_forward_map_custom1(params, tensor); @@ -2227,6 +2247,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_COUNT_EQUAL: case GGML_OP_SOLVE_TRI: case GGML_OP_GATED_DELTA_NET: + case GGML_OP_DSV4_HC_SPLIT_SINKHORN: + case GGML_OP_DSV4_HC_WEIGHTED_SUM: + case GGML_OP_DSV4_HC_EXPAND: + case GGML_OP_DSV4_FP8_KV_QUANTIZE: + case GGML_OP_DSV4_ROPE_TAIL: { n_tasks = n_threads; } break; @@ -2847,6 +2872,7 @@ struct ggml_cplan ggml_graph_plan( case GGML_OP_SOFT_MAX: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: + case 
GGML_OP_DSV4_ROPE_TAIL: { cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; } break; diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 7485ba4fc861..f473cb724725 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -5970,6 +5970,127 @@ void ggml_compute_forward_rope_back( } } +// ggml_compute_forward_dsv4_rope_tail + +template +static void ggml_compute_forward_dsv4_rope_tail_flt( + const ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const ggml_tensor * src2 = dst->src[2]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_I32); + + const int n_dims = ((int32_t *) dst->op_params)[0]; + const int mode = ((int32_t *) dst->op_params)[1]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[2]; + const bool inverse = ((int32_t *) dst->op_params)[3] != 0; + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(nb0 == nb00); + GGML_ASSERT(nb0 == sizeof(T)); + GGML_ASSERT(n_dims <= ne0); + GGML_ASSERT(n_dims % 2 == 0); + GGML_ASSERT(mode == GGML_ROPE_TYPE_NORMAL || mode == GGML_ROPE_TYPE_NEOX); + + const int64_t n_nope = ne0 - n_dims; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(dst); + const int dr = (nr + nth - 1)/nth; + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float theta_scale = powf(freq_base, 
-2.0f/n_dims); + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + const float * freq_factors = NULL; + if (src2 != NULL) { + GGML_ASSERT(src2->type == GGML_TYPE_F32); + GGML_ASSERT(src2->ne[0] >= n_dims / 2); + freq_factors = (const float *) src2->data; + } + + const float sin_sign = inverse ? -1.0f : 1.0f; + const int32_t * pos = (const int32_t *) src1->data; + + int ir = 0; + int64_t last_i2 = -1; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + + float * cache = (float *) params->wdata + (n_dims + CACHE_LINE_SIZE_F32)*ith; + if (last_i2 != i2) { + const int64_t p = pos[i2]; + ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, n_dims, ext_factor, attn_factor, cache, sin_sign, theta_scale); + last_i2 = i2; + } + + const T * src = (const T *)((const char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + T * dst_data = (T *)(( char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + + for (int64_t i0 = 0; i0 < n_nope; ++i0) { + dst_data[i0] = src[i0]; + } + + const T * src_tail = src + n_nope; + T * dst_tail = dst_data + n_nope; + + switch (mode) { + case GGML_ROPE_TYPE_NORMAL: + rotate_pairs(n_dims, 1, cache, src_tail, dst_tail, 1); + break; + case GGML_ROPE_TYPE_NEOX: + rotate_pairs(n_dims, n_dims/2, cache, src_tail, dst_tail); + break; + default: + GGML_ABORT("rope type not supported"); + } + } + } + } +} + +void ggml_compute_forward_dsv4_rope_tail( + const ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_dsv4_rope_tail_flt(params, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_dsv4_rope_tail_flt(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_conv_transpose_1d static void 
ggml_compute_forward_conv_transpose_1d_f16_f32( @@ -10903,6 +11024,343 @@ void ggml_compute_forward_rwkv_wkv7( } } +// ggml_compute_forward_dsv4_hc_split_sinkhorn + +void ggml_compute_forward_dsv4_hc_split_sinkhorn( + const ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * mixes = dst->src[0]; + const ggml_tensor * scale = dst->src[1]; + const ggml_tensor * base = dst->src[2]; + + GGML_ASSERT(mixes->type == GGML_TYPE_F32); + GGML_ASSERT(scale->type == GGML_TYPE_F32); + GGML_ASSERT(base->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(mixes->nb[0] == sizeof(float)); + GGML_ASSERT(scale->nb[0] == sizeof(float)); + GGML_ASSERT(base->nb[0] == sizeof(float)); + GGML_ASSERT(dst->nb[0] == sizeof(float)); + + const int n_hc = ggml_get_op_params_i32(dst, 0); + const int sinkhorn_iters = ggml_get_op_params_i32(dst, 1); + const float eps = ggml_get_op_params_f32(dst, 2); + const int64_t mix_hc = mixes->ne[0]; + const int64_t n_rows = ggml_nrows(mixes); + + GGML_ASSERT(n_hc > 0 && n_hc <= 16); + GGML_ASSERT(sinkhorn_iters > 0); + GGML_ASSERT(mix_hc == (2 + n_hc) * n_hc); + GGML_ASSERT(ggml_nrows(dst) == n_rows); + + const float * scale_data = (const float *) scale->data; + const float * base_data = (const float *) base->data; + + const float pre_scale = scale_data[0]; + const float post_scale = scale_data[1]; + const float comb_scale = scale_data[2]; + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t dr = (n_rows + nth - 1) / nth; + const int64_t r0 = dr * ith; + const int64_t r1 = std::min(r0 + dr, n_rows); + + for (int64_t r = r0; r < r1; ++r) { + const float * mix = (const float *) ((const char *) mixes->data + r*mixes->nb[1]); + float * out = (float *) ((char *) dst->data + r*dst->nb[1]); + + for (int i = 0; i < n_hc; ++i) { + const float z = mix[i] * pre_scale + base_data[i]; + out[i] = 1.0f / (1.0f + expf(-z)) + eps; + } + + for (int i = 0; i < n_hc; ++i) { + const int off = n_hc + 
i; + const float z = mix[off] * post_scale + base_data[off]; + out[off] = 2.0f / (1.0f + expf(-z)); + } + + float c[16*16]; + + // comb is laid out as a flat [n_hc*n_hc] block per token, written as + // c[src_hc + dst_hc*n_hc]; after the graph's reshape_3d this is ggml + // tensor comb[ne0=src_hc, ne1=dst_hc, ne2=token]. The Sinkhorn pass + // below softmaxes over src_hc, then alternates row/col normalization. + // ggml_dsv4_hc_expand intentionally reads ggml-dim0 as dst_hc, which + // transposes this matrix on read so it computes comb^T @ residual + // (the V4 hyperconnection contract). CPU/Metal/CUDA use the identical + // flat write + transposed read; do not "fix" one side in isolation. + for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { + float row_max = -INFINITY; + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + const int idx = src_hc + dst_hc*n_hc; + const int off = 2*n_hc + idx; + const float v = mix[off] * comb_scale + base_data[off]; + c[idx] = v; + row_max = std::max(row_max, v); + } + + float row_sum = 0.0f; + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + const int idx = src_hc + dst_hc*n_hc; + const float v = expf(c[idx] - row_max); + c[idx] = v; + row_sum += v; + } + + const float inv_sum = 1.0f / row_sum; + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + const int idx = src_hc + dst_hc*n_hc; + c[idx] = c[idx] * inv_sum + eps; + } + } + + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + float sum = 0.0f; + for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { + sum += c[src_hc + dst_hc*n_hc]; + } + + const float inv_denom = 1.0f / (sum + eps); + for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { + c[src_hc + dst_hc*n_hc] *= inv_denom; + } + } + + for (int iter = 1; iter < sinkhorn_iters; ++iter) { + for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { + float sum = 0.0f; + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + sum += c[src_hc + dst_hc*n_hc]; + } + + const float inv_denom = 1.0f / (sum + eps); + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + 
c[src_hc + dst_hc*n_hc] *= inv_denom; + } + } + + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + float sum = 0.0f; + for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { + sum += c[src_hc + dst_hc*n_hc]; + } + + const float inv_denom = 1.0f / (sum + eps); + for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { + c[src_hc + dst_hc*n_hc] *= inv_denom; + } + } + } + + for (int i = 0; i < n_hc*n_hc; ++i) { + out[2*n_hc + i] = c[i]; + } + } +} + +// ggml_compute_forward_dsv4_hc_weighted_sum + +void ggml_compute_forward_dsv4_hc_weighted_sum( + const ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * x = dst->src[0]; + const ggml_tensor * weights = dst->src[1]; + + GGML_ASSERT(x->type == GGML_TYPE_F32); + GGML_ASSERT(weights->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(x->ne[0] == dst->ne[0]); + GGML_ASSERT(x->ne[1] == weights->ne[0]); + GGML_ASSERT(x->ne[2] == dst->ne[1]); + GGML_ASSERT(weights->ne[1] == dst->ne[1]); + GGML_ASSERT(x->ne[3] == 1); + GGML_ASSERT(weights->ne[2] == 1); + GGML_ASSERT(weights->ne[3] == 1); + GGML_ASSERT(dst->ne[2] == 1); + GGML_ASSERT(dst->ne[3] == 1); + + const int64_t n_embd = dst->ne[0]; + const int64_t n_hc = x->ne[1]; + const int64_t n_tokens = dst->ne[1]; + const int64_t n_elem = n_embd * n_tokens; + + const int64_t i0 = (n_elem * params->ith) / params->nth; + const int64_t i1 = (n_elem * (params->ith + 1)) / params->nth; + + const char * x_data = (const char *) x->data; + const char * w_data = (const char *) weights->data; + char * y_data = ( char *) dst->data; + + for (int64_t i = i0; i < i1; ++i) { + const int64_t d = i % n_embd; + const int64_t t = i / n_embd; + + float acc = 0.0f; + for (int64_t h = 0; h < n_hc; ++h) { + const float xv = *(const float *) (x_data + d*x->nb[0] + h*x->nb[1] + t*x->nb[2]); + const float wv = *(const float *) (w_data + h*weights->nb[0] + t*weights->nb[1]); + acc += xv * wv; + } + + *(float *) (y_data + d*dst->nb[0] + t*dst->nb[1]) = acc; + } +} + +// 
ggml_compute_forward_dsv4_hc_expand + +void ggml_compute_forward_dsv4_hc_expand( + const ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * block_out = dst->src[0]; + const ggml_tensor * residual = dst->src[1]; + const ggml_tensor * post = dst->src[2]; + const ggml_tensor * comb = dst->src[3]; + + GGML_ASSERT(block_out->type == GGML_TYPE_F32); + GGML_ASSERT(residual->type == GGML_TYPE_F32); + GGML_ASSERT(post->type == GGML_TYPE_F32); + GGML_ASSERT(comb->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(block_out->ne[0] == dst->ne[0]); + GGML_ASSERT(block_out->ne[1] == dst->ne[2]); + GGML_ASSERT(residual->ne[0] == dst->ne[0]); + GGML_ASSERT(residual->ne[1] == dst->ne[1]); + GGML_ASSERT(residual->ne[2] == dst->ne[2]); + GGML_ASSERT(post->ne[0] == dst->ne[1]); + GGML_ASSERT(post->ne[1] == dst->ne[2]); + GGML_ASSERT(comb->ne[0] == dst->ne[1]); + GGML_ASSERT(comb->ne[1] == dst->ne[1]); + GGML_ASSERT(comb->ne[2] == dst->ne[2]); + GGML_ASSERT(block_out->ne[3] == 1); + GGML_ASSERT(residual->ne[3] == 1); + GGML_ASSERT(post->ne[2] == 1); + GGML_ASSERT(post->ne[3] == 1); + GGML_ASSERT(comb->ne[3] == 1); + GGML_ASSERT(dst->ne[3] == 1); + + const int64_t n_embd = dst->ne[0]; + const int64_t n_hc = dst->ne[1]; + const int64_t n_tokens = dst->ne[2]; + const int64_t n_elem = n_embd * n_hc * n_tokens; + + const int64_t i0 = (n_elem * params->ith) / params->nth; + const int64_t i1 = (n_elem * (params->ith + 1)) / params->nth; + + const char * block_data = (const char *) block_out->data; + const char * res_data = (const char *) residual->data; + const char * post_data = (const char *) post->data; + const char * comb_data = (const char *) comb->data; + char * dst_data = ( char *) dst->data; + + for (int64_t i = i0; i < i1; ++i) { + const int64_t d = i % n_embd; + const int64_t tmp = i / n_embd; + const int64_t dst_hc = tmp % n_hc; + const int64_t t = tmp / n_hc; + + const float block_v = *(const float *) (block_data + 
// Magnitudes of the 127 finite E4M3FN codes 0..126 in ascending order
// (code 126 == 448). Built once so the rounding helper below does not have to
// re-derive every code value with std::ldexp for each element it processes.
struct ggml_dsv4_e4m3fn_table {
    float value[127];

    ggml_dsv4_e4m3fn_table() {
        for (int i = 0; i < 127; ++i) {
            const int exp  = (i >> 3) & 0x0f;
            const int mant = i & 0x07;
            value[i] = exp == 0
                ? std::ldexp(float(mant), -9)                      // subnormal: mant * 2^-9
                : std::ldexp(1.0f + float(mant) / 8.0f, exp - 7);  // normal, bias 7
        }
    }
};

// Rounds x to the nearest value representable in E4M3FN and returns it as F32
// (a quantize -> dequantize round trip).
//
//  - |x| is saturated to 448 first, so +/-inf and out-of-range inputs map to +/-448.
//  - Exact ties go to the neighbour with the even code (round-half-to-even).
//  - The sign is re-applied as (x < 0 ? -1 : 1) * magnitude.
//  - NaN input yields +0.0f: every comparison against NaN is false, so code 0
//    is selected. This matches the exhaustive nearest-code search this helper
//    replaces.
static float ggml_dsv4_e4m3fn_dequant(float x) {
    static const ggml_dsv4_e4m3fn_table table;

    const float sign = x < 0.0f ? -1.0f : 1.0f;
    const float ax   = std::min(std::fabs(x), 448.0f);

    // First code whose magnitude is >= ax. It always exists for non-NaN input
    // because ax <= 448 == value[126]; for NaN it is code 0.
    const int hi = int(std::lower_bound(table.value, table.value + 127, ax) - table.value);
    if (hi == 0) {
        return sign * table.value[0];
    }

    // ax lies in (value[lo], value[hi]]: pick the closer neighbour, and on an
    // exact tie the one whose code is even (lo and hi are adjacent, so exactly
    // one of them is).
    const int   lo   = hi - 1;
    const float d_lo = ax - table.value[lo];
    const float d_hi = table.value[hi] - ax;

    const bool take_hi = d_hi < d_lo || (d_hi == d_lo && (hi & 1) == 0);

    return sign * table.value[take_hi ? hi : lo];
}
ggml_compute_forward_map_custom1 void ggml_compute_forward_map_custom1( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 7398e5618948..4da4db62aa4e 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -104,6 +104,11 @@ void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, s void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_gated_delta_net(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_dsv4_hc_split_sinkhorn(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_dsv4_hc_weighted_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_dsv4_hc_expand(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_dsv4_fp8_kv_quantize(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_dsv4_rope_tail(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cu b/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cu new file mode 100644 index 000000000000..d32049c97da0 --- /dev/null +++ b/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cu @@ -0,0 +1,164 @@ +#include "dsv4-fp8-kv-quantize.cuh" + +#if !defined(__HIP_PLATFORM_AMD__) && __CUDA_ARCH__ >= 890 +#include +#endif + +#include + +// E4M3FN code value: 0..127. 
+// Format: 1 sign + 4 exponent + 3 mantissa, bias 7, no inf/nan reserved. +// (i >> 3) & 0xf = exponent, i & 7 = mantissa. Code 0 is +0. +// Mirrors the CPU helper at ggml-cpu/ops.cpp:11245-11247 and the Metal +// helper dsv4_e4m3fn_value at ggml-metal.metal:2302-2308. +static __device__ __forceinline__ float dsv4_e4m3fn_value(int i) { + const int e = (i >> 3) & 0x0f; + const int m = i & 0x07; + return e == 0 + ? float(m) * 0.001953125f // 2^-9 * m (subnormal) + : (1.0f + float(m) * 0.125f) * exp2f(float(e - 7)); // normal +} + +// Round |x| to the nearest E4M3FN positive code value, breaking ties +// toward the EVEN code (matches CPU reference ops.cpp:11242-11253 exactly). +// Returns the dequantized F32, sign-preserved. +static __device__ __forceinline__ float dsv4_e4m3fn_dequant_sw(float x) { + const float sign = x < 0.0f ? -1.0f : 1.0f; + const float ax = fminf(fabsf(x), 448.0f); + + int best = 0; + float best_diff = ax; + #pragma unroll + for (int i = 1; i < 127; ++i) { + const float val = dsv4_e4m3fn_value(i); + const float diff = fabsf(ax - val); + if (diff < best_diff || (diff == best_diff && (i & 1) == 0 && (best & 1) != 0)) { + best = i; + best_diff = diff; + } + } + return sign * dsv4_e4m3fn_value(best); +} + +// Dual-path E4M3FN quantize+dequantize round-trip with saturation. +// +// Native path uses NVIDIA's documented FP8 class API. The constructor +// __nv_fp8_e4m3(float) applies round-to-nearest-even and saturates to +// the finite E4M3 range (+/-448). The explicit float() conversion expands +// the FP8 storage back to F32. This is the supported public API per +// NVIDIA's cuda_fp8.h headers (CUDA toolkit >= 11.8). +// +// (We intentionally avoid the lower-level __nv_cvt_fp8_to_halfraw + +// __half2float chain: the class wrapper is clearer and avoids a half +// hop on F32-only data. There is no __nv_cvt_fp8_to_float intrinsic.) 
+static __device__ __forceinline__ float dsv4_e4m3fn_roundtrip(float x) { +#if !defined(__HIP_PLATFORM_AMD__) && __CUDA_ARCH__ >= 890 + const __nv_fp8_e4m3 q(x); + return float(q); +#else + // Software emulation: matches CPU reference bit-for-bit. + return dsv4_e4m3fn_dequant_sw(x); +#endif +} + +// Warp-level (32 threads) max-reduction via __shfl_xor_sync. +static __device__ __forceinline__ float warp_reduce_max(float v) { + #pragma unroll + for (int offset = 16; offset > 0; offset >>= 1) { + v = fmaxf(v, __shfl_xor_sync(0xffffffffu, v, offset, 32)); + } + return v; +} + +// One block per row. blockDim.x == 64 (two warps). +static __global__ void dsv4_fp8_kv_quantize_f32( + const char * __restrict__ src, + char * __restrict__ dst, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03, + const int64_t nb0, const int64_t nb1, const int64_t nb2, const int64_t nb3, + const int n_rot) { + + const int64_t n_rows = ne01 * ne02 * ne03; + const int64_t row = blockIdx.x; + if (row >= n_rows) return; + + const int tid = threadIdx.x; // 0..63 + const int warp_id = tid >> 5; // 0 or 1 + const int lane = tid & 31; + + const int64_t i1 = row % ne01; + const int64_t i2 = (row / ne01) % ne02; + const int64_t i3 = row / (ne01 * ne02); + + const char * src_base = src + i1*nb01 + i2*nb02 + i3*nb03; + char * dst_base = dst + i1*nb1 + i2*nb2 + i3*nb3; + + const int64_t n_nope = ne00 - (int64_t) n_rot; + + // Shared-mem slot for the two warps' partial max. + __shared__ float warp_max[2]; + + // Prefix loop: 64-element blocks. + for (int64_t off = 0; off < n_nope; off += 64) { + const float v = *(const float *)(src_base + (off + tid) * nb00); + + // Two-stage block-max reduction across 64 threads. + // Stage 1: each warp reduces its 32 lanes via shfl_xor; lane 0 stores + // the warp's max to shared memory. 
+ // Stage 2: a single thread (warp 0, lane 0) combines the two warp maxes + // and writes the final block max back to warp_max[0]. + float m = warp_reduce_max(fabsf(v)); + if (lane == 0) warp_max[warp_id] = m; + __syncthreads(); + if (warp_id == 0 && lane == 0) { + warp_max[0] = fmaxf(warp_max[0], warp_max[1]); + } + __syncthreads(); + + const float amax = fmaxf(warp_max[0], 1.0e-4f); + const float scale = exp2f(ceilf(log2f(amax / 448.0f))); + + const float q = dsv4_e4m3fn_roundtrip(fminf(fmaxf(v / scale, -448.0f), 448.0f)) * scale; + *(float *)(dst_base + (off + tid) * nb0) = q; + + __syncthreads(); // protect warp_max for the next block + } + + // Tail loop: copy n_rot elements per row through unchanged. + // 64 threads stride through the tail. + for (int64_t i = n_nope + tid; i < ne00; i += 64) { + *(float *)(dst_base + i * nb0) = *(const float *)(src_base + i * nb00); + } +} + +void ggml_cuda_op_dsv4_fp8_kv_quantize(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_are_same_shape(src, dst)); + + const int n_rot = ggml_get_op_params_i32(dst, 0); + const int64_t head_dim = src->ne[0]; + const int64_t n_nope = head_dim - (int64_t) n_rot; + + GGML_ASSERT(n_rot >= 0); + GGML_ASSERT(n_nope > 0); + GGML_ASSERT(n_nope % 64 == 0); + + const int64_t n_rows = src->ne[1] * src->ne[2] * src->ne[3]; + + const dim3 grid((unsigned) n_rows, 1, 1); + const dim3 block(64, 1, 1); + + cudaStream_t stream = ctx.stream(); + dsv4_fp8_kv_quantize_f32<<>>( + (const char *) src->data, + ( char *) dst->data, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + (int64_t) src->nb[0], (int64_t) src->nb[1], (int64_t) src->nb[2], (int64_t) src->nb[3], + (int64_t) dst->nb[0], (int64_t) dst->nb[1], (int64_t) dst->nb[2], (int64_t) dst->nb[3], + n_rot); + CUDA_CHECK(cudaGetLastError()); +} diff --git a/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cuh 
b/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cuh new file mode 100644 index 000000000000..8e0fd958674d --- /dev/null +++ b/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cuh @@ -0,0 +1,36 @@ +#pragma once + +// V4 FP8 KV-cache simulation: quantizes/dequantizes the non-RoPE prefix +// of each row in 64-element blocks through E4M3FN representation with +// per-block scaling; leaves the RoPE tail unchanged. +// +// Block-scaled algorithm (must match CPU reference for the +// test-backend-ops NMSE check): +// for each row (n_rows = ne01 * ne02 * ne03): +// for off in [0, n_nope) step 64: +// amax = max(|src[off..off+64)|, 1e-4) +// scale = 2^ceil(log2(amax / 448)) +// dst[off+i] = dequant_e4m3fn(clamp(src[off+i]/scale, -448, 448)) * scale +// copy src[n_nope..ne00) to dst unchanged // RoPE tail +// +// References: +// CPU reference: ggml/src/ggml-cpu/ops.cpp:11235-11313 +// Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:2302-2376 +// Metal dispatch: ggml/src/ggml-metal/ggml-metal-ops.cpp:1550-1594 +// Public API: ggml/include/ggml.h:2591 (ggml_dsv4_fp8_kv_quantize) +// +// Dual-path implementation: +// - __CUDA_ARCH__ >= 890 (Ada/Hopper/Blackwell): native FP8 via the +// __nv_fp8_e4m3 class wrapper from (round-to-nearest-even, +// saturate-to-finite to +/-448). +// - __CUDA_ARCH__ < 890 (Volta/Turing/Ampere): software emulation by +// nearest-even E4M3FN code search, mirroring the CPU reference. +// +// Both paths produce numerically equivalent output (subject to FP8's +// inherent lossiness). The four test_dsv4_fp8_kv_quantize cases from +// Stream A (tests/test-backend-ops.cpp:8868-8871) validate with +// max_nmse_err = 1e-3. 
+ +#include "common.cuh" + +void ggml_cuda_op_dsv4_fp8_kv_quantize(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/dsv4-hc-expand.cu b/ggml/src/ggml-cuda/dsv4-hc-expand.cu new file mode 100644 index 000000000000..8c4aab67f34d --- /dev/null +++ b/ggml/src/ggml-cuda/dsv4-hc-expand.cu @@ -0,0 +1,92 @@ +#include "dsv4-hc-expand.cuh" + +// out[i_embd, i_hc, i_tok] = post[i_hc, i_tok] * block_out[i_embd, i_tok] +// + sum_{hc'} comb[i_hc, hc', i_tok] * residual[i_embd, hc', i_tok] +// +// block_out is 2D (no hc axis); the post*block_out term is broadcast across hc. +// See ggml/src/ggml-cpu/ops.cpp:11218-11231 for the CPU reference loop body. +static __global__ void dsv4_hc_expand_f32( + const float * __restrict__ block_out, + const float * __restrict__ residual, + const float * __restrict__ post, + const float * __restrict__ comb, + float * __restrict__ dst, + const int n_embd, const int n_hc, const int n_tokens, + // block_out strides (2D -- no hc axis) + const int nb_b0, const int nb_b1, + // residual strides (3D) + const int nb_r0, const int nb_r1, const int nb_r2, + // post strides (2D) + const int nb_p0, const int nb_p1, + // comb strides (3D) + const int nb_c0, const int nb_c1, const int nb_c2, + // dst strides (3D) + const int nb0, const int nb1, const int nb2) { + const int64_t gid = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; + const int64_t total = (int64_t)n_embd * n_hc * n_tokens; + if (gid >= total) { + return; + } + + const int i_embd = gid % n_embd; + const int rest = gid / n_embd; + const int i_hc = rest % n_hc; + const int i_tok = rest / n_hc; + + // post * block_out (block_out is 2D: indexed by (i_embd, i_tok) only) + const float p = *(const float *)((const char *)post + + i_hc * nb_p0 + i_tok * nb_p1); + const float b = *(const float *)((const char *)block_out + + i_embd * nb_b0 + i_tok * nb_b1); + float acc = p * b; + + // comb @ residual: sum over hc' + for (int hc_p = 0; hc_p < n_hc; ++hc_p) { + const float c 
= *(const float *)((const char *)comb + + i_hc * nb_c0 + hc_p * nb_c1 + i_tok * nb_c2); + const float r = *(const float *)((const char *)residual + + i_embd * nb_r0 + hc_p * nb_r1 + i_tok * nb_r2); + acc += c * r; + } + + float * d = (float *)((char *)dst + + i_embd * nb0 + i_hc * nb1 + i_tok * nb2); + *d = acc; +} + +void ggml_cuda_op_dsv4_hc_expand(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * block_out = dst->src[0]; + const ggml_tensor * residual = dst->src[1]; + const ggml_tensor * post = dst->src[2]; + const ggml_tensor * comb = dst->src[3]; + + GGML_ASSERT(block_out->type == GGML_TYPE_F32); + GGML_ASSERT(residual->type == GGML_TYPE_F32); + GGML_ASSERT(post->type == GGML_TYPE_F32); + GGML_ASSERT(comb->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int n_embd = (int) dst->ne[0]; + const int n_hc = (int) dst->ne[1]; + const int n_tokens = (int) dst->ne[2]; + + const int64_t total = (int64_t)n_embd * n_hc * n_tokens; + constexpr int blk = 256; + const dim3 grid((total + blk - 1) / blk); + const dim3 block(blk); + + cudaStream_t stream = ctx.stream(); + dsv4_hc_expand_f32<<>>( + (const float *) block_out->data, + (const float *) residual->data, + (const float *) post->data, + (const float *) comb->data, + (float *) dst->data, + n_embd, n_hc, n_tokens, + (int) block_out->nb[0], (int) block_out->nb[1], + (int) residual->nb[0], (int) residual->nb[1], (int) residual->nb[2], + (int) post->nb[0], (int) post->nb[1], + (int) comb->nb[0], (int) comb->nb[1], (int) comb->nb[2], + (int) dst->nb[0], (int) dst->nb[1], (int) dst->nb[2]); + CUDA_CHECK(cudaGetLastError()); +} diff --git a/ggml/src/ggml-cuda/dsv4-hc-expand.cuh b/ggml/src/ggml-cuda/dsv4-hc-expand.cuh new file mode 100644 index 000000000000..28f1a0ee8c8c --- /dev/null +++ b/ggml/src/ggml-cuda/dsv4-hc-expand.cuh @@ -0,0 +1,26 @@ +#pragma once + +// V4 hyperconnection expand: per-token mix of block_out and residual. 
+// +// out[i, hc, tok] = post[hc, tok] * block_out[i, tok] +// + sum_{hc'} comb[hc, hc', tok] * residual[i, hc', tok] +// +// Shapes: +// block_out: 2D {n_embd, n_tokens} -- no hc axis +// residual: 3D {n_embd, n_hc, n_tokens} +// post: 2D {n_hc, n_tokens} +// comb: 3D {n_hc, n_hc, n_tokens} +// dst: 3D {n_embd, n_hc, n_tokens} +// +// Reference Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:2247-2276 +// CPU reference: ggml/src/ggml-cpu/ops.cpp:11200+ +// Public API: ggml/include/ggml.h:2581 (ggml_dsv4_hc_expand) +// Shape constructor: ggml/src/ggml.c:6363-6366 +// +// Embarrassingly parallel: one thread per output element (i_embd, i_hc, i_tok). +// Each thread does an n_hc-wide accumulation for the comb*residual term plus +// one fused multiply-add for the post*block_out term. + +#include "common.cuh" + +void ggml_cuda_op_dsv4_hc_expand(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cu b/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cu new file mode 100644 index 000000000000..39d7833c9b0b --- /dev/null +++ b/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cu @@ -0,0 +1,198 @@ +#include "dsv4-hc-split-sinkhorn.cuh" + +// Maximum n_hc supported (matches CPU reference assert at ops.cpp:11014 and +// the dst comb matrix scratch buffer size below). +#define DSV4_HC_SINKHORN_MAX_N_HC 16 + +// One block per row. Inside the block: +// - threads cooperate (parallel for) on the pre/post slices and the final +// copy of the comb matrix back to dst. +// - tid == 0 runs the n_hc x n_hc Sinkhorn iterations serially. n_hc <= 16 +// so this is at most a few thousand FLOPs per row. +// +// The comb matrix lives in shared memory (sized for the worst case 16x16 +// = 256 floats = 1 KiB per block, well within any device's shared-mem +// budget). 
+static __global__ void dsv4_hc_split_sinkhorn_f32( + const float * __restrict__ mixes, + const float * __restrict__ scale, + const float * __restrict__ base, + float * __restrict__ dst, + const int n_hc, + const int sinkhorn_iters, + const int n_rows, + const int mix_hc, + const int nb01, // input row stride in bytes + const int nb1, // output row stride in bytes + const float eps) { + const int row = blockIdx.x; + if (row >= n_rows) { + return; + } + + const int tid = threadIdx.x; + const int blksz = blockDim.x; + + const float pre_scale = scale[0]; + const float post_scale = scale[1]; + const float comb_scale = scale[2]; + + const float * row_in = (const float *) ((const char *) mixes + row * nb01); + float * row_out = (float *) ((char *) dst + row * nb1); + + // ---------------- Section 1: pre slice ---------------- + // out[i] = sigmoid(mix[i] * pre_scale + base[i]) + eps + for (int i = tid; i < n_hc; i += blksz) { + const float z = row_in[i] * pre_scale + base[i]; + row_out[i] = 1.0f / (1.0f + expf(-z)) + eps; + } + + // ---------------- Section 2: post slice ---------------- + // out[n_hc + i] = 2 * sigmoid(mix[n_hc + i] * post_scale + base[n_hc + i]) + for (int i = tid; i < n_hc; i += blksz) { + const int off = n_hc + i; + const float z = row_in[off] * post_scale + base[off]; + row_out[off] = 2.0f / (1.0f + expf(-z)); + } + + // ---------------- Section 3: comb matrix Sinkhorn ---------------- + // + // c[src_hc + dst_hc * n_hc] layout (matches CPU reference at + // ggml-cpu/ops.cpp:11055). + extern __shared__ float shmem[]; + float * c = shmem; // n_hc * n_hc floats + + // Load the comb logits = mix * comb_scale + base (parallel over the block). + for (int i = tid; i < n_hc * n_hc; i += blksz) { + const int off = 2 * n_hc + i; + c[i] = row_in[off] * comb_scale + base[off]; + } + __syncthreads(); + + // Sinkhorn iterations run on thread 0; n_hc <= 16 keeps the inner loops + // trivially cheap (~ 1k FLOPs per row total). 
+ if (tid == 0) { + // First pass: per-dst_hc softmax (max-subtract for numerical stability, + // exp, normalize) + eps stabilizer. + for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { + float row_max = -INFINITY; + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + row_max = fmaxf(row_max, c[src_hc + dst_hc * n_hc]); + } + + float row_sum = 0.0f; + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + const int idx = src_hc + dst_hc * n_hc; + const float v = expf(c[idx] - row_max); + c[idx] = v; + row_sum += v; + } + + const float inv_sum = 1.0f / row_sum; + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + const int idx = src_hc + dst_hc * n_hc; + c[idx] = c[idx] * inv_sum + eps; + } + } + + // First column-normalize: per src_hc, divide by (column sum + eps). + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + float sum = 0.0f; + for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { + sum += c[src_hc + dst_hc * n_hc]; + } + const float inv_denom = 1.0f / (sum + eps); + for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { + c[src_hc + dst_hc * n_hc] *= inv_denom; + } + } + + // Remaining sinkhorn_iters - 1 alternations: row-normalize then column-normalize. + for (int it = 1; it < sinkhorn_iters; ++it) { + // Row-normalize: per dst_hc, divide by (row sum + eps). + for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { + float sum = 0.0f; + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + sum += c[src_hc + dst_hc * n_hc]; + } + const float inv_denom = 1.0f / (sum + eps); + for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + c[src_hc + dst_hc * n_hc] *= inv_denom; + } + } + // Column-normalize: per src_hc, divide by (column sum + eps). 
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) { + float sum = 0.0f; + for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { + sum += c[src_hc + dst_hc * n_hc]; + } + const float inv_denom = 1.0f / (sum + eps); + for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { + c[src_hc + dst_hc * n_hc] *= inv_denom; + } + } + } + } + __syncthreads(); + + // Copy the comb matrix back to dst (parallel over the block). + for (int i = tid; i < n_hc * n_hc; i += blksz) { + row_out[2 * n_hc + i] = c[i]; + } + + // Suppress unused-warning for mix_hc; it's covered by the host-side asserts. + (void) mix_hc; +} + +void ggml_cuda_op_dsv4_hc_split_sinkhorn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * mixes = dst->src[0]; + const ggml_tensor * scale = dst->src[1]; + const ggml_tensor * base = dst->src[2]; + + GGML_ASSERT(mixes->type == GGML_TYPE_F32); + GGML_ASSERT(scale->type == GGML_TYPE_F32); + GGML_ASSERT(base->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(mixes->nb[0] == sizeof(float)); + GGML_ASSERT(scale->nb[0] == sizeof(float)); + GGML_ASSERT(base->nb[0] == sizeof(float)); + GGML_ASSERT(dst->nb[0] == sizeof(float)); + + const int n_hc = ggml_get_op_params_i32(dst, 0); + const int sinkhorn_iters = ggml_get_op_params_i32(dst, 1); + const float eps = ggml_get_op_params_f32(dst, 2); + + GGML_ASSERT(n_hc > 0 && n_hc <= DSV4_HC_SINKHORN_MAX_N_HC); + GGML_ASSERT(sinkhorn_iters > 0); + + const int n_rows = (int) ggml_nrows(mixes); + const int mix_hc = (int) mixes->ne[0]; + const int nb01 = (int) mixes->nb[1]; + const int nb1 = (int) dst->nb[1]; + + GGML_ASSERT(mix_hc == (2 + n_hc) * n_hc); + GGML_ASSERT((int) ggml_nrows(dst) == n_rows); + + // Block size MUST be a warp multiple (>= 32) so that the in-block + // __syncthreads() barriers are well-formed and any future warp-wide + // shuffle has a complete mask. With mix_hc in {24, 80} the natural + // size is rounded up to 32 or 96. 
+ constexpr int CUDA_WARP_SIZE = 32; + constexpr int CUDA_MAX_BLOCK = 256; + const int rounded = ((mix_hc + CUDA_WARP_SIZE - 1) / CUDA_WARP_SIZE) * CUDA_WARP_SIZE; + const int threads_per_block = std::min(CUDA_MAX_BLOCK, std::max(CUDA_WARP_SIZE, rounded)); + + const dim3 grid(n_rows); + const dim3 block(threads_per_block); + const size_t shared = (size_t) n_hc * (size_t) n_hc * sizeof(float); + + cudaStream_t stream = ctx.stream(); + dsv4_hc_split_sinkhorn_f32<<>>( + (const float *) mixes->data, + (const float *) scale->data, + (const float *) base->data, + (float *) dst->data, + n_hc, sinkhorn_iters, n_rows, mix_hc, + nb01, nb1, eps); + CUDA_CHECK(cudaGetLastError()); +} diff --git a/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cuh b/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cuh new file mode 100644 index 000000000000..d77455298468 --- /dev/null +++ b/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cuh @@ -0,0 +1,42 @@ +#pragma once + +// V4 hyperconnection splitter with Sinkhorn normalization. +// +// Splits the mix vector [mix_hc, n_rows] into three sections: +// - out[0:n_hc] = sigmoid(mix[i] * scale[0] + base[i]) + eps ("pre") +// - out[n_hc:2*n_hc] = 2 * sigmoid(mix[off] * scale[1] + base[off]) ("post") +// - out[2*n_hc:] = Sinkhorn-normalized n_hc x n_hc comb matrix ("comb") +// +// The comb section starts as logits (mix * scale[2] + base), then a +// per-dst_hc row softmax (max-subtract + exp + normalize) with `eps` added, +// then alternating column / row normalizations for sinkhorn_iters - 1 more +// iterations. The result is doubly-stochastic up to `eps`-stabilization. +// +// Expected shape: +// mixes : [mix_hc, n_rows] float32, contiguous along ne[0] +// scale : [3] float32 (pre, post, comb scales) +// base : [mix_hc] float32, matches the mix layout +// dst : [mix_hc, n_rows] float32, same shape as mixes +// where mix_hc == (2 + n_hc) * n_hc and n_hc in [1, 16]. +// +// Op params (i32, i32, f32): n_hc, sinkhorn_iters, eps. 
+// +// CUDA kernel design: +// - One CUDA block per output row. +// - Block size rounded up to a warp multiple (>= 32) so __syncthreads() +// and any future block-wide reductions are well-formed even when the +// natural row width (mix_hc = 24 or 80 for n_hc = 4 or 8) is not a +// warp multiple. Excess threads do no memory work; loops guard `i < n`. +// - Sections 1, 2, and the final copy parallelize across the block. +// - Section 3 (Sinkhorn iterations on the n_hc x n_hc comb matrix) is +// serialized on `tid == 0`; n_hc <= 16 makes this trivially cheap +// (O(n_hc^2 * sinkhorn_iters) per row) and avoids the complexity of +// warp-cooperative reductions over a 4-or-8-wide inner dimension. +// +// Reference Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:2076-2245 +// CPU reference: ggml/src/ggml-cpu/ops.cpp:11037-11117 +// Public API: ggml/include/ggml.h (ggml_dsv4_hc_split_sinkhorn) + +#include "common.cuh" + +void ggml_cuda_op_dsv4_hc_split_sinkhorn(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cu b/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cu new file mode 100644 index 000000000000..75f38d2aa7c3 --- /dev/null +++ b/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cu @@ -0,0 +1,104 @@ +#include "dsv4-hc-weighted-sum.cuh" + +// CUDA port of kernel_dsv4_hc_weighted_sum (ggml-metal.metal:2278-2327). +// +// Layout (FP32 throughout): +// x : {n_embd, n_hc, n_tokens} +// weights : {n_hc, n_tokens} +// dst : {n_embd, n_tokens} +// Output[d, t] = sum_{h=0..n_hc-1} x[d, h, t] * weights[h, t]. +// +// One thread per output element. Total threads = n_embd * n_tokens. +// Strides are passed in BYTES (matching ggml's nb[] convention); element +// access is via `(const char *) base + d*nb0 + h*nb1 + t*nb2` reinterpret +// as `const float *`, identical to the Metal kernel and CPU reference. 
+ +static __global__ void dsv4_hc_weighted_sum_f32( + const char * __restrict__ x, + const char * __restrict__ weights, + char * __restrict__ dst, + const int n_embd, + const int n_hc, + const int n_tokens, + const int64_t nb_x0, + const int64_t nb_x1, + const int64_t nb_x2, + const int64_t nb_w0, + const int64_t nb_w1, + const int64_t nb0, + const int64_t nb1) { + const int64_t gid = (int64_t) blockIdx.x * blockDim.x + threadIdx.x; + const int64_t total = (int64_t) n_embd * n_tokens; + if (gid >= total) { + return; + } + + const int64_t d = gid % n_embd; + const int64_t t = gid / n_embd; + + float acc = 0.0f; + for (int h = 0; h < n_hc; ++h) { + const float xv = *((const float *) (x + d*nb_x0 + h*nb_x1 + t*nb_x2)); + const float wv = *((const float *) (weights + h*nb_w0 + t*nb_w1)); + acc += xv * wv; + } + + *((float *) (dst + d*nb0 + t*nb1)) = acc; +} + +void ggml_cuda_op_dsv4_hc_weighted_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * x = dst->src[0]; + const ggml_tensor * weights = dst->src[1]; + + GGML_ASSERT(x->type == GGML_TYPE_F32); + GGML_ASSERT(weights->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + // Shape contract: see ggml.c:6335-6339 and the CPU reference asserts at + // ggml-cpu/ops.cpp:11129-11140. 
+ GGML_ASSERT(x->ne[0] == dst->ne[0]); + GGML_ASSERT(x->ne[1] == weights->ne[0]); + GGML_ASSERT(x->ne[2] == dst->ne[1]); + GGML_ASSERT(weights->ne[1] == dst->ne[1]); + GGML_ASSERT(x->ne[3] == 1); + GGML_ASSERT(weights->ne[2] == 1); + GGML_ASSERT(weights->ne[3] == 1); + GGML_ASSERT(dst->ne[2] == 1); + GGML_ASSERT(dst->ne[3] == 1); + + const int n_embd = (int) dst->ne[0]; + const int n_hc = (int) x->ne[1]; + const int n_tokens = (int) dst->ne[1]; + + const int64_t nb_x0 = (int64_t) x->nb[0]; + const int64_t nb_x1 = (int64_t) x->nb[1]; + const int64_t nb_x2 = (int64_t) x->nb[2]; + const int64_t nb_w0 = (int64_t) weights->nb[0]; + const int64_t nb_w1 = (int64_t) weights->nb[1]; + const int64_t nb0 = (int64_t) dst->nb[0]; + const int64_t nb1 = (int64_t) dst->nb[1]; + + const int64_t total = (int64_t) n_embd * n_tokens; + if (total == 0) { + return; + } + + constexpr int CUDA_DSV4_HC_WEIGHTED_SUM_BLOCK_SIZE = 256; + const dim3 block_dims(CUDA_DSV4_HC_WEIGHTED_SUM_BLOCK_SIZE, 1, 1); + const dim3 grid_dims((unsigned) ((total + CUDA_DSV4_HC_WEIGHTED_SUM_BLOCK_SIZE - 1) / + CUDA_DSV4_HC_WEIGHTED_SUM_BLOCK_SIZE), + 1, 1); + + cudaStream_t stream = ctx.stream(); + + dsv4_hc_weighted_sum_f32<<>>( + (const char *) x->data, + (const char *) weights->data, + (char *) dst->data, + n_embd, n_hc, n_tokens, + nb_x0, nb_x1, nb_x2, + nb_w0, nb_w1, + nb0, nb1); + + CUDA_CHECK(cudaGetLastError()); +} diff --git a/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cuh b/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cuh new file mode 100644 index 000000000000..e6ee9c19a267 --- /dev/null +++ b/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cuh @@ -0,0 +1,25 @@ +#pragma once + +// V4 hyper-connection weighted-sum: collapses the hc dimension. 
+// +// out[embd, token] = sum over hc of weights[hc, token] * x[embd, hc, token] +// +// Inputs (all GGML_TYPE_F32): +// dst->src[0] = x shape {n_embd, n_hc, n_tokens, 1} +// dst->src[1] = weights shape {n_hc, n_tokens, 1, 1} +// Output (GGML_TYPE_F32): +// dst shape {n_embd, n_tokens, 1, 1} +// +// Reference Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:2278-2327 +// Reference Metal dispatch: ggml/src/ggml-metal/ggml-metal-ops.cpp:1440-1486 +// CPU reference: ggml/src/ggml-cpu/ops.cpp:11121 (ggml_compute_forward_dsv4_hc_weighted_sum) +// Public API: ggml/include/ggml.h:2574 (ggml_dsv4_hc_weighted_sum) +// +// Implementation: embarrassingly parallel; one thread per output element +// (n_embd * n_tokens total), each thread loops over n_hc to accumulate. +// Strides are kept in bytes (matching the Metal kernel + the ggml tensor +// nb[] convention) and applied via (const char *) base + offset casts. + +#include "common.cuh" + +void ggml_cuda_op_dsv4_hc_weighted_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/dsv4-rope-tail.cu b/ggml/src/ggml-cuda/dsv4-rope-tail.cu new file mode 100644 index 000000000000..abbf9b2181be --- /dev/null +++ b/ggml/src/ggml-cuda/dsv4-rope-tail.cu @@ -0,0 +1,219 @@ +#include "dsv4-rope-tail.cuh" + +#include "ggml.h" // ggml_rope_yarn_corr_dims, ggml_get_op_params_i32 + +#include // std::min / std::max in dispatch +#include // memcpy + +// YaRN helper. Direct port of ggml/src/ggml-cuda/rope.cu:22-41 +// (template rope_yarn). Duplicated here to keep this +// translation unit self-contained — rope.cuh does not currently expose +// the function as a reusable device helper. The math is identical. 
+ +struct dsv4_rope_corr_dims { + float v[2]; +}; + +static __device__ __forceinline__ float dsv4_rope_yarn_ramp( + const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +// forward=true: standard rotation; forward=false: inverse (sin flipped). +template +static __device__ __forceinline__ void dsv4_rope_yarn( + const float theta_extrap, const float freq_scale, + const dsv4_rope_corr_dims corr_dims, const int i0, + const float ext_factor, float mscale, + float & cos_theta, float & sin_theta) { + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + const float ramp_mix = dsv4_rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; + theta = theta_interp * (1.0f - ramp_mix) + theta_extrap * ramp_mix; + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + cos_theta = cosf(theta) * mscale; + sin_theta = sinf(theta) * mscale; + if (!forward) { + sin_theta = -sin_theta; + } +} + +// Main kernel. Launch shape matches Metal: +// grid = (ne01, ne02, ne03) +// block = (min(256, ne00), 1, 1) +// Each thread walks the ne00 dim with stride ntg (== blockDim.x). +// Translation of kernel_dsv4_rope_tail_f32 at +// ggml/src/ggml-metal/ggml-metal.metal:4906-4997. 
+static __global__ void dsv4_rope_tail_f32_kernel( + const float * __restrict__ src0, + const int * __restrict__ pos, + const float * __restrict__ freq_factors, + float * __restrict__ dst, + const int ne00, + const int nb00, const int nb01, const int nb02, const int nb03, + const int nb0, const int nb1, const int nb2, const int nb3, + const int n_dims, + const float freq_base, const float freq_scale, + const float ext_factor, const float attn_factor, + const dsv4_rope_corr_dims corr_dims, + const bool is_neox, const bool inverse) { + const int i1 = blockIdx.x; + const int i2 = blockIdx.y; + const int i3 = blockIdx.z; + const int tid = threadIdx.x; + const int ntg = blockDim.x; + + const int n_nope = ne00 - n_dims; + if (n_nope < 0) { + return; + } + + const float theta_base_pos = (float) pos[i2]; + const float inv_ndims = -1.0f / (float) n_dims; + + const char * src_base = (const char *) src0 + i3 * nb03 + i2 * nb02 + i1 * nb01; + char * dst_base = (char *) dst + i3 * nb3 + i2 * nb2 + i1 * nb1; + + for (int i0 = tid; i0 < ne00; i0 += ntg) { + // Pass-through prefix: non-RoPE portion of the row. + if (i0 < n_nope) { + *((float *) (dst_base + i0 * nb0)) = *((const float *) (src_base + i0 * nb00)); + continue; + } + + const int r = i0 - n_nope; + + if (is_neox) { + const int n_half = n_dims / 2; + if (r >= n_half) { + continue; + } + + const int ic = r; + const int rel_i0 = 2 * ic; + const float theta = theta_base_pos * powf(freq_base, inv_ndims * (float) rel_i0); + const float freq_factor = freq_factors ? freq_factors[ic] : 1.0f; + + float cos_theta; + float sin_theta; + // Use forward=true; inverse handled as a sign flip below to match + // Metal's "if (args.inverse) sin_theta = -sin_theta" pattern. 
+ dsv4_rope_yarn(theta / freq_factor, freq_scale, corr_dims, + rel_i0, ext_factor, attn_factor, + cos_theta, sin_theta); + if (inverse) { + sin_theta = -sin_theta; + } + + const int j0 = n_nope + ic; + const int j1 = n_nope + ic + n_half; + const float x0 = *((const float *) (src_base + j0 * nb00)); + const float x1 = *((const float *) (src_base + j1 * nb00)); + *((float *) (dst_base + j0 * nb0)) = x0 * cos_theta - x1 * sin_theta; + *((float *) (dst_base + j1 * nb0)) = x0 * sin_theta + x1 * cos_theta; + } else { + // NORMAL mode: rotate adjacent pair (j0, j0+1). + if ((r & 1) != 0) { + continue; + } + + const int ic = r / 2; + const float theta = theta_base_pos * powf(freq_base, inv_ndims * (float) r); + const float freq_factor = freq_factors ? freq_factors[ic] : 1.0f; + + float cos_theta; + float sin_theta; + dsv4_rope_yarn(theta / freq_factor, freq_scale, corr_dims, + r, ext_factor, attn_factor, + cos_theta, sin_theta); + if (inverse) { + sin_theta = -sin_theta; + } + + const int j0 = n_nope + r; + const int j1 = j0 + 1; + const float x0 = *((const float *) (src_base + j0 * nb00)); + const float x1 = *((const float *) (src_base + j1 * nb00)); + *((float *) (dst_base + j0 * nb0)) = x0 * cos_theta - x1 * sin_theta; + *((float *) (dst_base + j1 * nb0)) = x0 * sin_theta + x1 * cos_theta; + } + } +} + +void ggml_cuda_op_dsv4_rope_tail(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * pos = dst->src[1]; + const ggml_tensor * ff = dst->src[2]; // optional; may be NULL + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(pos->type == GGML_TYPE_I32); + + // op_params layout — matches Metal dispatch at + // ggml/src/ggml-metal/ggml-metal-ops.cpp:1606-1623 verbatim: + // [0] = n_dims (i32) + // [1] = mode (i32) + // [2] = n_ctx_orig (i32) + // [3] = inverse (i32, treated as bool) + // [4] = freq_base (f32) + // [5] = freq_scale (f32) + // [6] = ext_factor 
(f32) + // [7] = attn_factor (f32) + // [8] = beta_fast (f32) + // [9] = beta_slow (f32) + const int32_t n_dims = ggml_get_op_params_i32(dst, 0); + const int32_t mode = ggml_get_op_params_i32(dst, 1); + const int32_t n_ctx_orig = ggml_get_op_params_i32(dst, 2); + const int32_t inverse_i = ggml_get_op_params_i32(dst, 3); + const bool inverse = inverse_i != 0; + + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + memcpy(&freq_base, (const int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (const int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ext_factor, (const int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&attn_factor, (const int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&beta_fast, (const int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_slow, (const int32_t *) dst->op_params + 9, sizeof(float)); + + const bool is_neox = (mode == GGML_ROPE_TYPE_NEOX); + + // Precompute YaRN corr_dims host-side (matches Metal call at + // ggml/src/ggml-metal/ggml-metal.metal:4927). + dsv4_rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v); + + const int ne00 = (int) src0->ne[0]; + const int ne01 = (int) src0->ne[1]; + const int ne02 = (int) src0->ne[2]; + const int ne03 = (int) src0->ne[3]; + + GGML_ASSERT(ne01 > 0 && ne02 > 0 && ne03 > 0); + + const int nth = std::min(256, std::max(1, ne00)); + const dim3 grid(ne01, ne02, ne03); + const dim3 block(nth, 1, 1); + + cudaStream_t stream = ctx.stream(); + dsv4_rope_tail_f32_kernel<<>>( + (const float *) src0->data, + (const int *) pos->data, + ff ? 
(const float *) ff->data : nullptr, + (float *) dst->data, + ne00, + (int) src0->nb[0], (int) src0->nb[1], (int) src0->nb[2], (int) src0->nb[3], + (int) dst->nb[0], (int) dst->nb[1], (int) dst->nb[2], (int) dst->nb[3], + n_dims, + freq_base, freq_scale, ext_factor, attn_factor, + corr_dims, + is_neox, inverse); + + CUDA_CHECK(cudaGetLastError()); +} diff --git a/ggml/src/ggml-cuda/dsv4-rope-tail.cuh b/ggml/src/ggml-cuda/dsv4-rope-tail.cuh new file mode 100644 index 000000000000..24c1e5dac4fd --- /dev/null +++ b/ggml/src/ggml-cuda/dsv4-rope-tail.cuh @@ -0,0 +1,26 @@ +#pragma once + +// V4 partial-RoPE: applies RoPE rotation to the last n_dims elements of each +// row, leaving the non-RoPE prefix (i.e. the first ne00 - n_dims elements) +// unchanged. The rotation math is the same as ggml_rope_ext (with YaRN +// extrapolation when ext_factor != 0), restricted to the tail. +// +// Reference Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:4906-4997 +// CPU reference: ggml/src/ggml-cpu/ops.cpp:5961 +// Public API: ggml/include/ggml.h:2599 (ggml_dsv4_rope_tail) +// +// The dispatch function extracts op_params (i32 slots 0..3: +// n_dims, mode, n_ctx_orig, inverse; f32 slots 4..9: freq_base, freq_scale, +// ext_factor, attn_factor, beta_fast, beta_slow) from the destination +// tensor, precomputes YaRN corr_dims host-side, and launches the kernel +// with grid = (ne01, ne02, ne03), block.x = min(256, ne00), matching the +// Metal dispatch at ggml/src/ggml-metal/ggml-metal-ops.cpp:1670. +// +// Supports the two RoPE modes the public V4 API allows +// (ggml/src/ggml.c:6426 ASSERT mode == NORMAL || mode == NEOX). All +// other modes are rejected via ggml_backend_cuda_device_supports_op so +// the framework falls back to CPU rather than producing wrong output. 
+ +#include "common.cuh" + +void ggml_cuda_op_dsv4_rope_tail(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index beeb52389464..a0feb032e215 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -391,6 +391,18 @@ static __device__ __forceinline__ void dequantize_V_q4_0(const void * __restrict ((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]); } } else +#else // FP16_AVAILABLE + // Software-FP16 fallback: compute in float, pack via __floats2half2_rn (RTNE). + // Required because the half-precision V-dequant template is instantiated + // for the full CC matrix (50/61/70/...), but FP16_AVAILABLE is undefined for CC < 600. + if constexpr (std::is_same_v) { + const float d = __half2float(x[ib].d); + +#pragma unroll + for (int l0 = 0; l0 < ne; l0 += 2) { + ((half2 *) dst)[l0/2] = __floats2half2_rn(d * q8[l0 + 0], d * q8[l0 + 1]); + } + } else #endif // FP16_AVAILABLE if constexpr (std::is_same_v) { const float d = x[ib].d; @@ -431,6 +443,16 @@ static __device__ __forceinline__ void dequantize_V_q4_1(const void * __restrict ((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]) + m; } } else +#else // FP16_AVAILABLE + // Software-FP16 fallback: see dequantize_V_q4_0 for the rationale. + if constexpr (std::is_same_v) { + const float2 dm = __half22float2(x[ib].dm); + +#pragma unroll + for (int l0 = 0; l0 < ne; l0 += 2) { + ((half2 *) dst)[l0/2] = __floats2half2_rn(dm.x * q8[l0 + 0] + dm.y, dm.x * q8[l0 + 1] + dm.y); + } + } else #endif // FP16_AVAILABLE if constexpr (std::is_same_v) { const float2 dm = __half22float2(x[ib].dm); @@ -481,6 +503,16 @@ static __device__ __forceinline__ void dequantize_V_q5_0(const void * __restrict ((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]); } } else +#else // FP16_AVAILABLE + // Software-FP16 fallback: see dequantize_V_q4_0 for the rationale. 
+ if constexpr (std::is_same_v) { + const float d = __half2float(x[ib].d); + +#pragma unroll + for (int l0 = 0; l0 < ne; l0 += 2) { + ((half2 *) dst)[l0/2] = __floats2half2_rn(d * q8[l0 + 0], d * q8[l0 + 1]); + } + } else #endif // FP16_AVAILABLE if constexpr (std::is_same_v) { const float d = x[ib].d; @@ -531,6 +563,16 @@ static __device__ __forceinline__ void dequantize_V_q5_1(const void * __restrict ((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]) + m; } } else +#else // FP16_AVAILABLE + // Software-FP16 fallback: see dequantize_V_q4_0 for the rationale. + if constexpr (std::is_same_v) { + const float2 dm = __half22float2(x[ib].dm); + +#pragma unroll + for (int l0 = 0; l0 < ne; l0 += 2) { + ((half2 *) dst)[l0/2] = __floats2half2_rn(dm.x * q8[l0 + 0] + dm.y, dm.x * q8[l0 + 1] + dm.y); + } + } else #endif // FP16_AVAILABLE if constexpr (std::is_same_v) { const float2 dm = __half22float2(x[ib].dm); @@ -564,6 +606,16 @@ static __device__ __forceinline__ void dequantize_V_q8_0(const void * __restrict ((half2 *) dst)[l0/2] = d * make_half2(qs[l0 + 0], qs[l0 + 1]); } } else +#else // FP16_AVAILABLE + // Software-FP16 fallback: see dequantize_V_q4_0 for the rationale. 
+ if constexpr (std::is_same::value) { + const float d = __half2float(x[ib].d); + +#pragma unroll + for (int l0 = 0; l0 < ne; l0 += 2) { + ((half2 *) dst)[l0/2] = __floats2half2_rn(d * qs[l0 + 0], d * qs[l0 + 1]); + } + } else #endif // FP16_AVAILABLE if constexpr (std::is_same::value) { const float d = x[ib].d; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index e25be3592fd4..e0fbceec7a0e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -23,6 +23,11 @@ #include "ggml-cuda/cumsum.cuh" #include "ggml-cuda/diagmask.cuh" #include "ggml-cuda/diag.cuh" +#include "ggml-cuda/dsv4-fp8-kv-quantize.cuh" +#include "ggml-cuda/dsv4-hc-expand.cuh" +#include "ggml-cuda/dsv4-hc-split-sinkhorn.cuh" +#include "ggml-cuda/dsv4-hc-weighted-sum.cuh" +#include "ggml-cuda/dsv4-rope-tail.cuh" #include "ggml-cuda/fattn.cuh" #include "ggml-cuda/getrows.cuh" #include "ggml-cuda/im2col.cuh" @@ -2777,7 +2782,52 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * nb1, nb2, nb3, stream); } +// ---------- DSV4 debug logging (env-gated, set GGML_DSV4_DEBUG=1 to enable) ---------- +static bool dsv4_debug_enabled() { + static const bool enabled = (getenv("GGML_DSV4_DEBUG") != nullptr); + return enabled; +} + +static const char * dsv4_op_short(enum ggml_op op) { + switch (op) { + case GGML_OP_DSV4_ROPE_TAIL: return "DSV4_ROPE_TAIL"; + case GGML_OP_DSV4_HC_SPLIT_SINKHORN: return "DSV4_HC_SPLIT_SINKHORN"; + case GGML_OP_DSV4_HC_WEIGHTED_SUM: return "DSV4_HC_WEIGHTED_SUM"; + case GGML_OP_DSV4_HC_EXPAND: return "DSV4_HC_EXPAND"; + case GGML_OP_DSV4_FP8_KV_QUANTIZE: return "DSV4_FP8_KV_QUANTIZE"; + default: return nullptr; + } +} + +static bool dsv4_op_is_v4(enum ggml_op op) { + return dsv4_op_short(op) != nullptr; +} + +static void dsv4_log_op_entry(int device, const struct ggml_tensor * dst) { + if (!dsv4_debug_enabled() || !dsv4_op_is_v4(dst->op)) return; + fprintf(stderr, "[DSV4_DEBUG] dev=%d op=%s 
dst=%s(%s) shape=[%lld,%lld,%lld,%lld]\n", + device, dsv4_op_short(dst->op), + dst->name, ggml_type_name(dst->type), + (long long) dst->ne[0], (long long) dst->ne[1], + (long long) dst->ne[2], (long long) dst->ne[3]); + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (!dst->src[i]) continue; + const char * buft_name = "(null-buf)"; + int is_split = 0; + if (dst->src[i]->buffer) { + buft_name = ggml_backend_buft_name(dst->src[i]->buffer->buft); + is_split = ggml_backend_buft_is_cuda_split(dst->src[i]->buffer->buft) ? 1 : 0; + } + fprintf(stderr, "[DSV4_DEBUG] src[%d]=%s(%s) buft=%s split=%d data=%p extra=%p\n", + i, dst->src[i]->name, ggml_type_name(dst->src[i]->type), + buft_name, is_split, dst->src[i]->data, (void *) dst->src[i]->extra); + } + fflush(stderr); +} +// ---------- end DSV4 debug ---------- + static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { + dsv4_log_op_entry(ctx.device, dst); switch (dst->op) { case GGML_OP_ARGMAX: ggml_cuda_argmax(ctx, dst); @@ -3020,6 +3070,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_ROPE_BACK: ggml_cuda_op_rope_back(ctx, dst); break; + case GGML_OP_DSV4_ROPE_TAIL: + ggml_cuda_op_dsv4_rope_tail(ctx, dst); + break; case GGML_OP_ROLL: ggml_cuda_op_roll(ctx, dst); break; @@ -3089,6 +3142,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_RWKV_WKV7: ggml_cuda_op_rwkv_wkv7(ctx, dst); break; + case GGML_OP_DSV4_HC_EXPAND: + ggml_cuda_op_dsv4_hc_expand(ctx, dst); + break; + case GGML_OP_DSV4_FP8_KV_QUANTIZE: + ggml_cuda_op_dsv4_fp8_kv_quantize(ctx, dst); + break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_cuda_cross_entropy_loss_back(ctx, dst); break; @@ -3104,6 +3163,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_FILL: ggml_cuda_op_fill(ctx, dst); break; + case GGML_OP_DSV4_HC_SPLIT_SINKHORN: + ggml_cuda_op_dsv4_hc_split_sinkhorn(ctx, 
dst); + break; + case GGML_OP_DSV4_HC_WEIGHTED_SUM: + ggml_cuda_op_dsv4_hc_weighted_sum(ctx, dst); + break; default: return false; } @@ -3208,7 +3273,30 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_ #ifdef GGML_CUDA_NO_PEER_COPY return false; #else + if (dsv4_debug_enabled()) { + fprintf(stderr, "[DSV4_DEBUG] peer-copy: src_dev=%d dst_dev=%d bytes=%zu " + "src=%s(%s,op=%s,buft=%s) dst=%s(%s,op=%s,buft=%s) src_ptr=%p dst_ptr=%p\n", + cuda_ctx_src->device, cuda_ctx_dst->device, ggml_nbytes(dst), + src->name, ggml_type_name(src->type), ggml_op_name(src->op), + src->buffer ? ggml_backend_buft_name(src->buffer->buft) : "?", + dst->name, ggml_type_name(dst->type), ggml_op_name(dst->op), + dst->buffer ? ggml_backend_buft_name(dst->buffer->buft) : "?", + src->data, dst->data); + fflush(stderr); + // Force any deferred CUDA error to surface BEFORE the next op, so the log line + // immediately above truly identifies the failing copy (codex review nit #1). + cudaError_t pre_err = cudaGetLastError(); + if (pre_err != cudaSuccess) { + fprintf(stderr, "[DSV4_DEBUG] pre-copy stale error: %s\n", cudaGetErrorString(pre_err)); + fflush(stderr); + } + } CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); + if (dsv4_debug_enabled()) { + // Synchronous wait to force the async error (if any) to surface at the offending copy, + // not at some later API call. Heavy perturbation — only with GGML_DSV4_DEBUG=1. 
+ CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream())); + } #endif // GGML_CUDA_NO_PEER_COPY } @@ -5034,8 +5122,17 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type( static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context; - // split buffers can only be used with GGML_OP_MUL_MAT - if (op->op != GGML_OP_MUL_MAT) { + // split buffers can only be used with GGML_OP_MUL_MAT and DeepSeek V4 custom ops. + // Without the DSV4 exception, multi-GPU scheduler rejects the V4 ops once their + // weight tensors land in cuda_split buffers and falls back to CPU — which then + // corrupts data via host<->device transfer mismatches and crashes during decode. + // Reported and root-caused by @DenisVASI9 on an 8x A100 40GB rig. + if (op->op != GGML_OP_MUL_MAT && + op->op != GGML_OP_DSV4_HC_SPLIT_SINKHORN && + op->op != GGML_OP_DSV4_HC_WEIGHTED_SUM && + op->op != GGML_OP_DSV4_HC_EXPAND && + op->op != GGML_OP_DSV4_FP8_KV_QUANTIZE && + op->op != GGML_OP_DSV4_ROPE_TAIL) { for (int i = 0; i < GGML_MAX_SRC; i++) { if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) { return false; @@ -5053,6 +5150,30 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } } + // Some ops write through a pre-allocated destination buffer (e.g. SET_ROWS + // into a KV cache). For those, the dst lives on a specific device — dispatching + // the op on a different device causes the CUDA kernel to write through a + // foreign-device pointer (dst->data), surfacing as cudaErrorIllegalAddress. + // + // SET_ROWS returns a view tensor (ggml_view_tensor(ctx, a)) so op->buffer is + // nullptr. We must walk the view chain to find the real buffer. 
+ // Diagnosed via CUDA_LAUNCH_BLOCKING=1 + GGML_DSV4_DEBUG=1 on @DenisVASI9's + // 8x A100 rig: V4's dsv4_store_cache_rows emits SET_ROWS at layer-7 K-cache + // (on CUDA1) while sched dispatched on CUDA0 → illegal access. + { + const ggml_tensor * t = op; + while (t->view_src) { + t = t->view_src; + } + if (t->buffer && ggml_backend_buft_is_cuda(t->buffer->buft)) { + ggml_backend_cuda_buffer_type_context * buft_ctx = + (ggml_backend_cuda_buffer_type_context *) t->buffer->buft->context; + if (buft_ctx->device != dev_ctx->device) { + return false; + } + } + } + switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -5358,6 +5479,23 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_ROPE_BACK: { return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]); } + case GGML_OP_DSV4_ROPE_TAIL: { + // Only F32 in/out is supported on this kernel (matches Metal kargs). + if (op->src[0]->type != GGML_TYPE_F32 || op->type != GGML_TYPE_F32) { + return false; + } + // Kernel implements mode == NORMAL (0) and mode == NEOX (2); + // any other mode is rejected so the framework falls back to CPU + // rather than producing wrong output. ggml/src/ggml.c:6426 ASSERTs + // this constraint at op-construction time, but we re-check here + // for defense-in-depth. + const int32_t mode = ggml_get_op_params_i32(op, 1); + if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) { + return false; + } + // Same contiguity requirement as GGML_OP_ROPE. 
+ return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]); + } case GGML_OP_IM2COL: case GGML_OP_IM2COL_3D: case GGML_OP_CONV_2D: @@ -5393,6 +5531,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_GATED_LINEAR_ATTN: case GGML_OP_RWKV_WKV7: return true; + case GGML_OP_DSV4_HC_EXPAND: + return op->type == GGML_TYPE_F32 + && op->src[0]->type == GGML_TYPE_F32 + && op->src[1]->type == GGML_TYPE_F32 + && op->src[2]->type == GGML_TYPE_F32 + && op->src[3]->type == GGML_TYPE_F32; case GGML_OP_GATED_DELTA_NET: //TODO: enable once MUSA compiler is solved https://github.com/ggml-org/llama.cpp/pull/19504#issuecomment-4018634327 #ifdef GGML_USE_MUSA @@ -5400,6 +5544,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g #else return true; #endif // GGML_USE_MUSA + case GGML_OP_DSV4_FP8_KV_QUANTIZE: + return op->type == GGML_TYPE_F32 + && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_FLASH_ATTN_EXT: return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op); case GGML_OP_CROSS_ENTROPY_LOSS: @@ -5412,6 +5559,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_DIAG: case GGML_OP_SOLVE_TRI: return true; + case GGML_OP_DSV4_HC_SPLIT_SINKHORN: + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + op->src[2]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32; + case GGML_OP_DSV4_HC_WEIGHTED_SUM: + return op->type == GGML_TYPE_F32 + && op->src[0]->type == GGML_TYPE_F32 + && op->src[1]->type == GGML_TYPE_F32; default: return false; diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index e288a27f992a..d6e5bf98e04f 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -459,6 +459,54 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max(ggml_me return res; } 
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_split_sinkhorn(ggml_metal_library_t lib, const ggml_tensor * op) { + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[2]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + const char * name = "kernel_dsv4_hc_split_sinkhorn"; + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr); + } + + return res; +} + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_weighted_sum(ggml_metal_library_t lib, const ggml_tensor * op) { + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + const char * name = "kernel_dsv4_hc_weighted_sum"; + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr); + } + + return res; +} + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_expand(ggml_metal_library_t lib, const ggml_tensor * op) { + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[2]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[3]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + const char * name = "kernel_dsv4_hc_expand"; + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr); + } + + return res; +} + ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv(ggml_metal_library_t lib, const ggml_tensor * op) { GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); @@ -1429,6 
+1477,36 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext( return res; } +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_fp8_kv_quantize(ggml_metal_library_t lib, const ggml_tensor * op) { + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + const char * name = "kernel_dsv4_fp8_kv_quantize_f32"; + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr); + } + + res.smem = 64*sizeof(float); + + return res; +} + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_rope_tail(ggml_metal_library_t lib, const ggml_tensor * op) { + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + const char * name = "kernel_dsv4_rope_tail_f32"; + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr); + } + + return res; +} + ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_vec( ggml_metal_library_t lib, const ggml_tensor * op, diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 1f212a92f985..1e3a8485f922 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -123,6 +123,11 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_bl struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_add (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tri (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max (ggml_metal_library_t lib, const struct ggml_tensor * op); +struct 
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_split_sinkhorn(ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_weighted_sum(ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_expand (ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_fp8_kv_quantize(ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_rope_tail (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op); diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 780dfe81bb3c..f8b2e65c2a46 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1187,6 +1187,53 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: return has_simdgroup_reduction; + case GGML_OP_DSV4_HC_SPLIT_SINKHORN: + return ggml_is_contiguous_rows(op->src[0]) && + ggml_is_contiguous(op->src[1]) && + ggml_is_contiguous(op->src[2]) && + op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + op->src[2]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32; + case GGML_OP_DSV4_HC_WEIGHTED_SUM: + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == 
GGML_TYPE_F32 && + op->type == GGML_TYPE_F32 && + op->src[0]->ne[0] == op->ne[0] && + op->src[0]->ne[1] == op->src[1]->ne[0] && + op->src[0]->ne[2] == op->ne[1] && + op->src[1]->ne[1] == op->ne[1]; + case GGML_OP_DSV4_HC_EXPAND: + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + op->src[2]->type == GGML_TYPE_F32 && + op->src[3]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32 && + op->src[0]->ne[0] == op->ne[0] && + op->src[0]->ne[1] == op->ne[2] && + op->src[1]->ne[0] == op->ne[0] && + op->src[1]->ne[1] == op->ne[1] && + op->src[1]->ne[2] == op->ne[2] && + op->src[2]->ne[0] == op->ne[1] && + op->src[2]->ne[1] == op->ne[2] && + op->src[3]->ne[0] == op->ne[1] && + op->src[3]->ne[1] == op->ne[1] && + op->src[3]->ne[2] == op->ne[2]; + case GGML_OP_DSV4_FP8_KV_QUANTIZE: + return op->src[0]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32 && + op->src[0]->ne[0] > ggml_get_op_params_i32(op, 0) && + (op->src[0]->ne[0] - ggml_get_op_params_i32(op, 0)) % 64 == 0; + case GGML_OP_DSV4_ROPE_TAIL: + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_I32 && + op->type == GGML_TYPE_F32 && + op->src[0]->ne[2] == op->src[1]->ne[0] && + ggml_get_op_params_i32(op, 0) > 0 && + ggml_get_op_params_i32(op, 0) <= op->src[0]->ne[0] && + ggml_get_op_params_i32(op, 0) % 2 == 0 && + (ggml_get_op_params_i32(op, 1) == GGML_ROPE_TYPE_NORMAL || + ggml_get_op_params_i32(op, 1) == GGML_ROPE_TYPE_NEOX); case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV7: return true; diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index ff74cafb5b79..48301eff4709 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -798,6 +798,90 @@ typedef struct { int32_t n_head_log2; } ggml_metal_kargs_soft_max; +typedef struct { + int32_t n_hc; + int32_t sinkhorn_iters; + int64_t n_rows; + int64_t mix_hc; + uint64_t nb01; + uint64_t nb1; + float eps; +} 
ggml_metal_kargs_dsv4_hc_split_sinkhorn; + +typedef struct { + int64_t n_embd; + int64_t n_hc; + int64_t n_tokens; + uint64_t nb_x0; + uint64_t nb_x1; + uint64_t nb_x2; + uint64_t nb_w0; + uint64_t nb_w1; + uint64_t nb0; + uint64_t nb1; +} ggml_metal_kargs_dsv4_hc_weighted_sum; + +typedef struct { + int64_t n_embd; + int64_t n_hc; + int64_t n_tokens; + uint64_t nb_block0; + uint64_t nb_block1; + uint64_t nb_res0; + uint64_t nb_res1; + uint64_t nb_res2; + uint64_t nb_post0; + uint64_t nb_post1; + uint64_t nb_comb0; + uint64_t nb_comb1; + uint64_t nb_comb2; + uint64_t nb0; + uint64_t nb1; + uint64_t nb2; +} ggml_metal_kargs_dsv4_hc_expand; + +typedef struct { + int64_t ne00; + int64_t ne01; + int64_t ne02; + int64_t ne03; + uint64_t nb00; + uint64_t nb01; + uint64_t nb02; + uint64_t nb03; + uint64_t nb0; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; + int32_t n_rot; +} ggml_metal_kargs_dsv4_fp8_kv_quantize; + +typedef struct { + int64_t ne00; + int64_t ne01; + int64_t ne02; + int64_t ne03; + uint64_t nb00; + uint64_t nb01; + uint64_t nb02; + uint64_t nb03; + uint64_t nb0; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; + int32_t n_dims; + int32_t mode; + int32_t n_ctx_orig; + int32_t inverse; + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + bool src2; +} ggml_metal_kargs_dsv4_rope_tail; + typedef struct { int64_t ne00; int64_t ne01; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index a114391c2e8c..b2f6ed37847d 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -320,6 +320,26 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { { n_fuse = ggml_metal_op_soft_max(ctx, idx); } break; + case GGML_OP_DSV4_HC_SPLIT_SINKHORN: + { + n_fuse = ggml_metal_op_dsv4_hc_split_sinkhorn(ctx, idx); + } break; + case GGML_OP_DSV4_HC_WEIGHTED_SUM: + { + n_fuse = ggml_metal_op_dsv4_hc_weighted_sum(ctx, 
idx); + } break; + case GGML_OP_DSV4_HC_EXPAND: + { + n_fuse = ggml_metal_op_dsv4_hc_expand(ctx, idx); + } break; + case GGML_OP_DSV4_FP8_KV_QUANTIZE: + { + n_fuse = ggml_metal_op_dsv4_fp8_kv_quantize(ctx, idx); + } break; + case GGML_OP_DSV4_ROPE_TAIL: + { + n_fuse = ggml_metal_op_dsv4_rope_tail(ctx, idx); + } break; case GGML_OP_SSM_CONV: { n_fuse = ggml_metal_op_ssm_conv(ctx, idx); @@ -1369,6 +1389,289 @@ int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) { return 1; } +int ggml_metal_op_dsv4_hc_split_sinkhorn(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[2]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[0]->ne[2] == 1); + GGML_ASSERT(op->src[0]->ne[3] == 1); + + const int32_t n_hc = ggml_get_op_params_i32(op, 0); + const int32_t sinkhorn_iters = ggml_get_op_params_i32(op, 1); + const float eps = ggml_get_op_params_f32(op, 2); + + GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne); + GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); + GGML_TENSOR_LOCALS(uint64_t, nb, op, nb); + + ggml_metal_kargs_dsv4_hc_split_sinkhorn args = { + /*.n_hc =*/ n_hc, + /*.sinkhorn_iters =*/ sinkhorn_iters, + /*.n_rows =*/ ne01*ne02*ne03, + /*.mix_hc =*/ ne00, + /*.nb01 =*/ nb01, + /*.nb1 =*/ nb1, + /*.eps =*/ eps, + }; + + auto pipeline = ggml_metal_library_get_pipeline_dsv4_hc_split_sinkhorn(lib, op); + + const int nth = std::min(256, std::max(1, args.n_rows)); + const int n_tg = (args.n_rows + nth - 1) / nth; + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); + ggml_metal_encoder_set_buffer 
(enc, ggml_metal_get_buffer_id(op->src[2]), 3); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 4); + + ggml_metal_encoder_dispatch_threadgroups(enc, n_tg, 1, 1, nth, 1, 1); + + return 1; +} + +int ggml_metal_op_dsv4_hc_weighted_sum(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + ggml_tensor * x = op->src[0]; + ggml_tensor * weights = op->src[1]; + + GGML_TENSOR_LOCALS(int64_t, ne, op, ne); + GGML_TENSOR_LOCALS(uint64_t, nb, op, nb); + GGML_TENSOR_LOCALS(uint64_t, nb_x, x, nb); + GGML_TENSOR_LOCALS(uint64_t, nb_w, weights, nb); + + ggml_metal_kargs_dsv4_hc_weighted_sum args = { + /*.n_embd =*/ ne0, + /*.n_hc =*/ x->ne[1], + /*.n_tokens =*/ ne1, + /*.nb_x0 =*/ nb_x0, + /*.nb_x1 =*/ nb_x1, + /*.nb_x2 =*/ nb_x2, + /*.nb_w0 =*/ nb_w0, + /*.nb_w1 =*/ nb_w1, + /*.nb0 =*/ nb0, + /*.nb1 =*/ nb1, + }; + + auto pipeline = ggml_metal_library_get_pipeline_dsv4_hc_weighted_sum(lib, op); + + const int64_t n_elem = ne0*ne1; + const int nth = std::min(256, std::max(1, n_elem)); + const int n_tg = (n_elem + nth - 1) / nth; + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(x), 1); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(weights), 2); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); + + ggml_metal_encoder_dispatch_threadgroups(enc, n_tg, 1, 1, nth, 1, 1); + + return 1; +} + +int ggml_metal_op_dsv4_hc_expand(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[1]->type == 
GGML_TYPE_F32); + GGML_ASSERT(op->src[2]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[3]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + ggml_tensor * block_out = op->src[0]; + ggml_tensor * residual = op->src[1]; + ggml_tensor * post = op->src[2]; + ggml_tensor * comb = op->src[3]; + + GGML_TENSOR_LOCALS(int64_t, ne, op, ne); + GGML_TENSOR_LOCALS(uint64_t, nb, op, nb); + GGML_TENSOR_LOCALS(uint64_t, nb_block, block_out, nb); + GGML_TENSOR_LOCALS(uint64_t, nb_res, residual, nb); + GGML_TENSOR_LOCALS(uint64_t, nb_post, post, nb); + GGML_TENSOR_LOCALS(uint64_t, nb_comb, comb, nb); + + ggml_metal_kargs_dsv4_hc_expand args = { + /*.n_embd =*/ ne0, + /*.n_hc =*/ ne1, + /*.n_tokens =*/ ne2, + /*.nb_block0 =*/ nb_block0, + /*.nb_block1 =*/ nb_block1, + /*.nb_res0 =*/ nb_res0, + /*.nb_res1 =*/ nb_res1, + /*.nb_res2 =*/ nb_res2, + /*.nb_post0 =*/ nb_post0, + /*.nb_post1 =*/ nb_post1, + /*.nb_comb0 =*/ nb_comb0, + /*.nb_comb1 =*/ nb_comb1, + /*.nb_comb2 =*/ nb_comb2, + /*.nb0 =*/ nb0, + /*.nb1 =*/ nb1, + /*.nb2 =*/ nb2, + }; + + auto pipeline = ggml_metal_library_get_pipeline_dsv4_hc_expand(lib, op); + + const int64_t n_elem = ne0*ne1*ne2; + const int nth = std::min(256, std::max(1, n_elem)); + const int n_tg = (n_elem + nth - 1) / nth; + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(block_out), 1); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(residual), 2); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(post), 3); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(comb), 4); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 5); + + ggml_metal_encoder_dispatch_threadgroups(enc, n_tg, 1, 1, nth, 1, 1); + + return 1; +} + +int ggml_metal_op_dsv4_fp8_kv_quantize(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = 
ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + const int32_t n_rot = ggml_get_op_params_i32(op, 0); + + GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne); + GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); + GGML_TENSOR_LOCALS(uint64_t, nb, op, nb); + + ggml_metal_kargs_dsv4_fp8_kv_quantize args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.nb0 =*/ nb0, + /*.nb1 =*/ nb1, + /*.nb2 =*/ nb2, + /*.nb3 =*/ nb3, + /*.n_rot =*/ n_rot, + }; + + auto pipeline = ggml_metal_library_get_pipeline_dsv4_fp8_kv_quantize(lib, op); + + const int64_t n_rows = ne01*ne02*ne03; + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2); + ggml_metal_encoder_set_threadgroup_memory_size(enc, pipeline.smem, 0); + + ggml_metal_encoder_dispatch_threadgroups(enc, n_rows, 1, 1, 64, 1, 1); + + return 1; +} + +int ggml_metal_op_dsv4_rope_tail(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[1]->type == GGML_TYPE_I32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + const int32_t n_dims = ggml_get_op_params_i32(op, 0); + const int32_t mode = ggml_get_op_params_i32(op, 1); + const int32_t n_ctx_orig = ggml_get_op_params_i32(op, 2); + const int32_t inverse = ggml_get_op_params_i32(op, 3); + + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + + memcpy(&freq_base, (const int32_t *) op->op_params + 4, sizeof(float)); + memcpy(&freq_scale, 
(const int32_t *) op->op_params + 5, sizeof(float)); + memcpy(&ext_factor, (const int32_t *) op->op_params + 6, sizeof(float)); + memcpy(&attn_factor, (const int32_t *) op->op_params + 7, sizeof(float)); + memcpy(&beta_fast, (const int32_t *) op->op_params + 8, sizeof(float)); + memcpy(&beta_slow, (const int32_t *) op->op_params + 9, sizeof(float)); + + GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne); + GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); + GGML_TENSOR_LOCALS(uint64_t, nb, op, nb); + + ggml_metal_kargs_dsv4_rope_tail args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.nb0 =*/ nb0, + /*.nb1 =*/ nb1, + /*.nb2 =*/ nb2, + /*.nb3 =*/ nb3, + /*.n_dims =*/ n_dims, + /*.mode =*/ mode, + /*.n_ctx_orig =*/ n_ctx_orig, + /*.inverse =*/ inverse, + /*.freq_base =*/ freq_base, + /*.freq_scale =*/ freq_scale, + /*.ext_factor =*/ ext_factor, + /*.attn_factor =*/ attn_factor, + /*.beta_fast =*/ beta_fast, + /*.beta_slow =*/ beta_slow, + /*.src2 =*/ op->src[2] != nullptr, + }; + + auto pipeline = ggml_metal_library_get_pipeline_dsv4_rope_tail(lib, op); + + const int nth = std::min(256, std::max(1, ne00)); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); + if (op->src[2]) { + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[2]), 3); + } else { + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 3); + } + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 4); + + ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1); + + return 1; +} + int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) { ggml_tensor * op = ctx->node(idx); diff --git 
a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h index 36c61071b4fa..e402a450e619 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.h +++ b/ggml/src/ggml-metal/ggml-metal-ops.h @@ -55,6 +55,11 @@ int ggml_metal_op_get_rows (ggml_metal_op_t ctx, int idx); int ggml_metal_op_set_rows (ggml_metal_op_t ctx, int idx); int ggml_metal_op_diag (ggml_metal_op_t ctx, int idx); int ggml_metal_op_soft_max (ggml_metal_op_t ctx, int idx); +int ggml_metal_op_dsv4_hc_split_sinkhorn(ggml_metal_op_t ctx, int idx); +int ggml_metal_op_dsv4_hc_weighted_sum(ggml_metal_op_t ctx, int idx); +int ggml_metal_op_dsv4_hc_expand (ggml_metal_op_t ctx, int idx); +int ggml_metal_op_dsv4_fp8_kv_quantize(ggml_metal_op_t ctx, int idx); +int ggml_metal_op_dsv4_rope_tail (ggml_metal_op_t ctx, int idx); int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx); int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx); int ggml_metal_op_rwkv (ggml_metal_op_t ctx, int idx); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index f6ffb2b3a1c6..37cabfcdb2bb 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2073,6 +2073,308 @@ template [[host_name("kernel_soft_max_f32")]] kernel kernel_soft_max_t kerne template [[host_name("kernel_soft_max_f16_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4; template [[host_name("kernel_soft_max_f32_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4; +kernel void kernel_dsv4_hc_split_sinkhorn( + constant ggml_metal_kargs_dsv4_hc_split_sinkhorn & args, + device const float * mixes, + device const float * scale, + device const float * base, + device float * dst, + uint tid [[thread_position_in_grid]]) { + if ((int64_t) tid >= args.n_rows) { + return; + } + + constexpr int HC_MAX = 16; + const int HC = args.n_hc; + if (HC <= 0 || HC > HC_MAX) { + return; + } + + device const float * mix = mixes + ((int64_t) tid)*args.mix_hc; + device float * out = dst + ((int64_t) 
tid)*args.mix_hc; + + const float epsv = args.eps; + const float pre_scale = scale[0]; + const float post_scale = scale[1]; + const float comb_scale = scale[2]; + + if (HC == 4) { + const float4 pre_z = + *((device const float4 *) mix) * pre_scale + + *((device const float4 *) base); + *((device float4 *) out) = 1.0f / (1.0f + exp(-pre_z)) + epsv; + + const float4 post_z = + *((device const float4 *) (mix + 4)) * post_scale + + *((device const float4 *) (base + 4)); + *((device float4 *) (out + 4)) = 2.0f / (1.0f + exp(-post_z)); + + float4 r0 = + *((device const float4 *) (mix + 8)) * comb_scale + + *((device const float4 *) (base + 8)); + float4 r1 = + *((device const float4 *) (mix + 12)) * comb_scale + + *((device const float4 *) (base + 12)); + float4 r2 = + *((device const float4 *) (mix + 16)) * comb_scale + + *((device const float4 *) (base + 16)); + float4 r3 = + *((device const float4 *) (mix + 20)) * comb_scale + + *((device const float4 *) (base + 20)); + + const float m0 = max(max(r0.x, r0.y), max(r0.z, r0.w)); + const float m1 = max(max(r1.x, r1.y), max(r1.z, r1.w)); + const float m2 = max(max(r2.x, r2.y), max(r2.z, r2.w)); + const float m3 = max(max(r3.x, r3.y), max(r3.z, r3.w)); + + r0 = exp(r0 - m0); + r1 = exp(r1 - m1); + r2 = exp(r2 - m2); + r3 = exp(r3 - m3); + + r0 = r0 * (1.0f / (r0.x + r0.y + r0.z + r0.w)) + epsv; + r1 = r1 * (1.0f / (r1.x + r1.y + r1.z + r1.w)) + epsv; + r2 = r2 * (1.0f / (r2.x + r2.y + r2.z + r2.w)) + epsv; + r3 = r3 * (1.0f / (r3.x + r3.y + r3.z + r3.w)) + epsv; + + float4 col_inv = 1.0f / (r0 + r1 + r2 + r3 + epsv); + r0 *= col_inv; + r1 *= col_inv; + r2 *= col_inv; + r3 *= col_inv; + + for (int iter = 1; iter < args.sinkhorn_iters; ++iter) { + r0 *= 1.0f / (r0.x + r0.y + r0.z + r0.w + epsv); + r1 *= 1.0f / (r1.x + r1.y + r1.z + r1.w + epsv); + r2 *= 1.0f / (r2.x + r2.y + r2.z + r2.w + epsv); + r3 *= 1.0f / (r3.x + r3.y + r3.z + r3.w + epsv); + + col_inv = 1.0f / (r0 + r1 + r2 + r3 + epsv); + r0 *= col_inv; + r1 *= 
col_inv; + r2 *= col_inv; + r3 *= col_inv; + } + + *((device float4 *) (out + 8)) = r0; + *((device float4 *) (out + 12)) = r1; + *((device float4 *) (out + 16)) = r2; + *((device float4 *) (out + 20)) = r3; + return; + } + + for (int i = 0; i < HC; ++i) { + const float z = mix[i] * pre_scale + base[i]; + out[i] = 1.0f / (1.0f + exp(-z)) + epsv; + } + + for (int i = 0; i < HC; ++i) { + const int off = HC + i; + const float z = mix[off] * post_scale + base[off]; + out[off] = 2.0f / (1.0f + exp(-z)); + } + + float c[HC_MAX*HC_MAX]; + + for (int dst_hc = 0; dst_hc < HC; ++dst_hc) { + float row_max = -INFINITY; + for (int src_hc = 0; src_hc < HC; ++src_hc) { + const int idx = src_hc + dst_hc*HC; + const int off = 2*HC + idx; + const float v = mix[off] * comb_scale + base[off]; + c[idx] = v; + row_max = max(row_max, v); + } + + float row_sum = 0.0f; + for (int src_hc = 0; src_hc < HC; ++src_hc) { + const int idx = src_hc + dst_hc*HC; + const float v = exp(c[idx] - row_max); + c[idx] = v; + row_sum += v; + } + + const float inv_sum = 1.0f / row_sum; + for (int src_hc = 0; src_hc < HC; ++src_hc) { + const int idx = src_hc + dst_hc*HC; + c[idx] = c[idx] * inv_sum + epsv; + } + } + + for (int src_hc = 0; src_hc < HC; ++src_hc) { + float sum = 0.0f; + for (int dst_hc = 0; dst_hc < HC; ++dst_hc) { + sum += c[src_hc + dst_hc*HC]; + } + + const float inv_denom = 1.0f / (sum + epsv); + for (int dst_hc = 0; dst_hc < HC; ++dst_hc) { + c[src_hc + dst_hc*HC] *= inv_denom; + } + } + + for (int iter = 1; iter < args.sinkhorn_iters; ++iter) { + for (int dst_hc = 0; dst_hc < HC; ++dst_hc) { + float sum = 0.0f; + for (int src_hc = 0; src_hc < HC; ++src_hc) { + sum += c[src_hc + dst_hc*HC]; + } + + const float inv_denom = 1.0f / (sum + epsv); + for (int src_hc = 0; src_hc < HC; ++src_hc) { + c[src_hc + dst_hc*HC] *= inv_denom; + } + } + + for (int src_hc = 0; src_hc < HC; ++src_hc) { + float sum = 0.0f; + for (int dst_hc = 0; dst_hc < HC; ++dst_hc) { + sum += c[src_hc + dst_hc*HC]; + } + 
+ const float inv_denom = 1.0f / (sum + epsv); + for (int dst_hc = 0; dst_hc < HC; ++dst_hc) { + c[src_hc + dst_hc*HC] *= inv_denom; + } + } + } + + for (int i = 0; i < HC*HC; ++i) { + out[2*HC + i] = c[i]; + } +} + +kernel void kernel_dsv4_hc_expand( + constant ggml_metal_kargs_dsv4_hc_expand & args, + device const char * block_out, + device const char * residual, + device const char * post, + device const char * comb, + device char * dst, + uint gid [[thread_position_in_grid]]) { + const int64_t n_elem = args.n_embd * args.n_hc * args.n_tokens; + if ((int64_t) gid >= n_elem) { + return; + } + + const int64_t d = ((int64_t) gid) % args.n_embd; + const int64_t tmp = ((int64_t) gid) / args.n_embd; + const int64_t dst_hc = tmp % args.n_hc; + const int64_t t = tmp / args.n_hc; + + const float block_v = *((device const float *) (block_out + d*args.nb_block0 + t*args.nb_block1)); + const float post_v = *((device const float *) (post + dst_hc*args.nb_post0 + t*args.nb_post1)); + + float acc = block_v * post_v; + for (int64_t src_hc = 0; src_hc < args.n_hc; ++src_hc) { + const float comb_v = *((device const float *) (comb + dst_hc*args.nb_comb0 + src_hc*args.nb_comb1 + t*args.nb_comb2)); + const float res_v = *((device const float *) (residual + d*args.nb_res0 + src_hc*args.nb_res1 + t*args.nb_res2)); + acc += comb_v * res_v; + } + + *((device float *) (dst + d*args.nb0 + dst_hc*args.nb1 + t*args.nb2)) = acc; +} + +kernel void kernel_dsv4_hc_weighted_sum( + constant ggml_metal_kargs_dsv4_hc_weighted_sum & args, + device const char * x, + device const char * weights, + device char * dst, + uint gid [[thread_position_in_grid]]) { + const int64_t n_elem = args.n_embd * args.n_tokens; + if ((int64_t) gid >= n_elem) { + return; + } + + const int64_t d = ((int64_t) gid) % args.n_embd; + const int64_t t = ((int64_t) gid) / args.n_embd; + + float acc = 0.0f; + for (int64_t h = 0; h < args.n_hc; ++h) { + const float xv = *((device const float *) (x + d*args.nb_x0 + h*args.nb_x1 + 
t*args.nb_x2)); + const float wv = *((device const float *) (weights + h*args.nb_w0 + t*args.nb_w1)); + acc += xv * wv; + } + + *((device float *) (dst + d*args.nb0 + t*args.nb1)) = acc; +} + +static inline float dsv4_e4m3fn_value(int i) { + const int exp = (i >> 3) & 0x0f; + const int mant = i & 0x07; + return exp == 0 + ? float(mant) * 0.001953125f + : (1.0f + float(mant) * 0.125f) * exp2(float(exp - 7)); +} + +static inline float dsv4_e4m3fn_dequant(float x) { + const float sign = x < 0.0f ? -1.0f : 1.0f; + const float ax = min(abs(x), 448.0f); + + int best = 0; + float best_diff = ax; + for (int i = 1; i < 127; ++i) { + const float val = dsv4_e4m3fn_value(i); + const float diff = abs(ax - val); + if (diff < best_diff || (diff == best_diff && (i & 1) == 0 && (best & 1) != 0)) { + best = i; + best_diff = diff; + } + } + + return sign * dsv4_e4m3fn_value(best); +} + +kernel void kernel_dsv4_fp8_kv_quantize_f32( + constant ggml_metal_kargs_dsv4_fp8_kv_quantize & args, + device const char * src0, + device char * dst, + threadgroup float * scratch [[threadgroup(0)]], + uint row [[threadgroup_position_in_grid]], + uint tid [[thread_position_in_threadgroup]]) { + const int64_t n_rows = args.ne01 * args.ne02 * args.ne03; + if ((int64_t) row >= n_rows) { + return; + } + + const int64_t i1 = row % args.ne01; + const int64_t i2 = (row / args.ne01) % args.ne02; + const int64_t i3 = row / (args.ne01 * args.ne02); + + device const char * src_base = src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03; + device char * dst_base = dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3; + + const int64_t n_nope = args.ne00 - args.n_rot; + + for (int64_t off = 0; off < n_nope; off += 64) { + float v = 0.0f; + if (tid < 64) { + v = *((device const float *) (src_base + (off + tid)*args.nb00)); + scratch[tid] = abs(v); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint stride = 32; stride > 0; stride >>= 1) { + if (tid < stride) { + scratch[tid] = max(scratch[tid], scratch[tid 
+ stride]); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + const float amax = max(scratch[0], 1.0e-4f); + const float scale = exp2(ceil(log2(amax / 448.0f))); + if (tid < 64) { + const float q = dsv4_e4m3fn_dequant(clamp(v / scale, -448.0f, 448.0f)) * scale; + *((device float *) (dst_base + (off + tid)*args.nb0)) = q; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + for (int64_t i = n_nope + tid; i < args.ne00; i += 64) { + *((device float *) (dst_base + i*args.nb0)) = *((device const float *) (src_base + i*args.nb00)); + } +} + // ref: ggml.c:ggml_compute_forward_ssm_conv_f32 kernel void kernel_ssm_conv_f32_f32( constant ggml_metal_kargs_ssm_conv & args, @@ -4632,6 +4934,95 @@ template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kerne template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision; template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision; +kernel void kernel_dsv4_rope_tail_f32( + constant ggml_metal_kargs_dsv4_rope_tail & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + uint tid [[thread_index_in_threadgroup]], + ushort3 ntg [[threads_per_threadgroup]], + uint3 tgpig [[threadgroup_position_in_grid]]) { + const int i1 = tgpig[0]; + const int i2 = tgpig[1]; + const int i3 = tgpig[2]; + + const int n_nope = args.ne00 - args.n_dims; + if (n_nope < 0) { + return; + } + + device const int32_t * pos = (device const int32_t *) src1; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + const float theta_base = (float) pos[i2]; + const float inv_ndims = -1.f/args.n_dims; + const bool is_neox = args.mode == 2; + + for (int i0 = tid; i0 < args.ne00; i0 += ntg.x) { + device const char * src_base = src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01; + device char * dst_base = dst + i3*args.nb3 
+ i2*args.nb2 + i1*args.nb1; + + if (i0 < n_nope) { + *((device float *) (dst_base + i0*args.nb0)) = *((device const float *) (src_base + i0*args.nb00)); + continue; + } + + const int r = i0 - n_nope; + if (is_neox) { + const int n_half = args.n_dims/2; + if (r >= n_half) { + continue; + } + + const int ic = r; + const int rel_i0 = 2*ic; + const float theta = theta_base * pow(args.freq_base, inv_ndims*rel_i0); + const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f; + + float cos_theta; + float sin_theta; + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, rel_i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + if (args.inverse) { + sin_theta = -sin_theta; + } + + const int j0 = n_nope + ic; + const int j1 = n_nope + ic + n_half; + const float x0 = *((device const float *) (src_base + j0*args.nb00)); + const float x1 = *((device const float *) (src_base + j1*args.nb00)); + + *((device float *) (dst_base + j0*args.nb0)) = x0*cos_theta - x1*sin_theta; + *((device float *) (dst_base + j1*args.nb0)) = x0*sin_theta + x1*cos_theta; + } else { + if ((r & 1) != 0) { + continue; + } + + const int ic = r/2; + const float theta = theta_base * pow(args.freq_base, inv_ndims*r); + const float freq_factor = args.src2 ? 
((device const float *) src2)[ic] : 1.0f; + + float cos_theta; + float sin_theta; + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, r, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + if (args.inverse) { + sin_theta = -sin_theta; + } + + const int j0 = n_nope + r; + const int j1 = j0 + 1; + const float x0 = *((device const float *) (src_base + j0*args.nb00)); + const float x1 = *((device const float *) (src_base + j1*args.nb00)); + + *((device float *) (dst_base + j0*args.nb0)) = x0*cos_theta - x1*sin_theta; + *((device float *) (dst_base + j1*args.nb0)) = x0*sin_theta + x1*cos_theta; + } + } +} + typedef void (im2col_t)( constant ggml_metal_kargs_im2col & args, device const float * x, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 476c30797956..8b06c0bd5a49 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1063,6 +1063,11 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "RWKV_WKV7", "SOLVE_TRI", "GATED_DELTA_NET", + "DSV4_HC_SPLIT_SINKHORN", + "DSV4_HC_WEIGHTED_SUM", + "DSV4_HC_EXPAND", + "DSV4_FP8_KV_QUANTIZE", + "DSV4_ROPE_TAIL", "UNARY", @@ -1080,7 +1085,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1173,6 +1178,11 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rwkv_wkv7(r, w, k, v, a, b, s)", "A X = B, A triangular, solve X", "gated_delta_net(q, k, v, g, beta, s)", + "dsv4_hc_split_sinkhorn(x)", + "dsv4_hc_weighted_sum(x)", + "dsv4_hc_expand(x)", + "dsv4_fp8_kv_quantize(x)", + "dsv4_rope_tail(x)", "unary(x)", @@ -1190,7 +1200,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -6230,6 +6240,180 @@ 
struct ggml_tensor * ggml_gated_delta_net( return result; } +// ggml_dsv4_hc_split_sinkhorn + +struct ggml_tensor * ggml_dsv4_hc_split_sinkhorn( + struct ggml_context * ctx, + struct ggml_tensor * mixes, + struct ggml_tensor * scale, + struct ggml_tensor * base, + int n_hc, + int sinkhorn_iters, + float eps) { + GGML_ASSERT(mixes->type == GGML_TYPE_F32); + GGML_ASSERT(scale->type == GGML_TYPE_F32); + GGML_ASSERT(base->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous_rows(mixes)); + GGML_ASSERT(ggml_is_contiguous(scale)); + GGML_ASSERT(ggml_is_contiguous(base)); + + GGML_ASSERT(n_hc > 0); + GGML_ASSERT(n_hc <= 16); // CPU forward uses a fixed float c[16*16] scratch + GGML_ASSERT(sinkhorn_iters > 0); + GGML_ASSERT(mixes->ne[0] == (2 + n_hc) * n_hc); + GGML_ASSERT(mixes->ne[2] == 1); + GGML_ASSERT(mixes->ne[3] == 1); + GGML_ASSERT(ggml_nelements(scale) >= 3); + GGML_ASSERT(ggml_nelements(base) >= mixes->ne[0]); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, mixes); + + ggml_set_op_params_i32(result, 0, n_hc); + ggml_set_op_params_i32(result, 1, sinkhorn_iters); + ggml_set_op_params_f32(result, 2, eps); + + result->op = GGML_OP_DSV4_HC_SPLIT_SINKHORN; + result->src[0] = mixes; + result->src[1] = scale; + result->src[2] = base; + + return result; +} + +// ggml_dsv4_hc_weighted_sum + +struct ggml_tensor * ggml_dsv4_hc_weighted_sum( + struct ggml_context * ctx, + struct ggml_tensor * x, + struct ggml_tensor * weights) { + GGML_ASSERT(x->type == GGML_TYPE_F32); + GGML_ASSERT(weights->type == GGML_TYPE_F32); + + GGML_ASSERT(x->ne[1] == weights->ne[0]); + GGML_ASSERT(x->ne[2] == weights->ne[1]); + GGML_ASSERT(x->ne[3] == 1); + GGML_ASSERT(weights->ne[2] == 1); + GGML_ASSERT(weights->ne[3] == 1); + + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, x->ne[0], x->ne[2]); + + result->op = GGML_OP_DSV4_HC_WEIGHTED_SUM; + result->src[0] = x; + result->src[1] = weights; + + return result; +} + +// ggml_dsv4_hc_expand + +struct ggml_tensor * 
ggml_dsv4_hc_expand( + struct ggml_context * ctx, + struct ggml_tensor * block_out, + struct ggml_tensor * residual, + struct ggml_tensor * post, + struct ggml_tensor * comb) { + GGML_ASSERT(block_out->type == GGML_TYPE_F32); + GGML_ASSERT(residual->type == GGML_TYPE_F32); + GGML_ASSERT(post->type == GGML_TYPE_F32); + GGML_ASSERT(comb->type == GGML_TYPE_F32); + + GGML_ASSERT(block_out->ne[0] == residual->ne[0]); + GGML_ASSERT(block_out->ne[1] == residual->ne[2]); + GGML_ASSERT(block_out->ne[2] == 1); + GGML_ASSERT(block_out->ne[3] == 1); + GGML_ASSERT(post->ne[0] == residual->ne[1]); + GGML_ASSERT(post->ne[1] == residual->ne[2]); + GGML_ASSERT(post->ne[2] == 1); + GGML_ASSERT(post->ne[3] == 1); + GGML_ASSERT(comb->ne[0] == residual->ne[1]); + GGML_ASSERT(comb->ne[1] == residual->ne[1]); + GGML_ASSERT(comb->ne[2] == residual->ne[2]); + GGML_ASSERT(comb->ne[3] == 1); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, residual); + + result->op = GGML_OP_DSV4_HC_EXPAND; + result->src[0] = block_out; + result->src[1] = residual; + result->src[2] = post; + result->src[3] = comb; + + return result; +} + +// ggml_dsv4_fp8_kv_quantize + +struct ggml_tensor * ggml_dsv4_fp8_kv_quantize( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_rot) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(n_rot >= 0); + GGML_ASSERT(a->ne[0] > n_rot); + GGML_ASSERT((a->ne[0] - n_rot) % 64 == 0); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + ggml_set_op_params_i32(result, 0, n_rot); + + result->op = GGML_OP_DSV4_FP8_KV_QUANTIZE; + result->src[0] = a; + + return result; +} + +// ggml_dsv4_rope_tail + +struct ggml_tensor * ggml_dsv4_rope_tail( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pos, + struct ggml_tensor * freq_factors, + int n_dims, + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, + bool inverse) { + 
GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported"); + GGML_ASSERT(mode == GGML_ROPE_TYPE_NORMAL || mode == GGML_ROPE_TYPE_NEOX); + GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16); + GGML_ASSERT(pos->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_is_vector(pos)); + GGML_ASSERT(a->ne[2] == pos->ne[0]); + GGML_ASSERT(n_dims > 0); + GGML_ASSERT(n_dims <= a->ne[0]); + GGML_ASSERT(n_dims % 2 == 0); + + if (freq_factors) { + GGML_ASSERT(freq_factors->type == GGML_TYPE_F32); + GGML_ASSERT(freq_factors->ne[0] >= n_dims / 2); + } + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + int32_t params[16] = { n_dims, mode, n_ctx_orig, inverse ? 1 : 0 }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &ext_factor, sizeof(float)); + memcpy(params + 7, &attn_factor, sizeof(float)); + memcpy(params + 8, &beta_fast, sizeof(float)); + memcpy(params + 9, &beta_slow, sizeof(float)); + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_DSV4_ROPE_TAIL; + result->src[0] = a; + result->src[1] = pos; + result->src[2] = freq_factors; + + return result; +} + //////////////////////////////////////////////////////////////////////////////// struct ggml_hash_set ggml_hash_set_new(size_t size) { diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c25f217f990e..8f44c7965e87 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -145,6 +145,10 @@ class LLM: INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step" FULL_ATTENTION_INTERVAL = "{arch}.full_attention_interval" ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale" + HASH_LAYER_COUNT = "{arch}.hash_layer_count" + HYPER_CONNECTION_COUNT = "{arch}.hyper_connection.count" + HYPER_CONNECTION_SINKHORN_ITERS = "{arch}.hyper_connection.sinkhorn_iterations" + HYPER_CONNECTION_EPS = "{arch}.hyper_connection.epsilon" ALTUP_ACTIVE_IDX = 
"{arch}.altup.active_idx" ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs" EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input" @@ -184,6 +188,10 @@ class Attention: SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers" SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern" TEMPERATURE_SCALE = "{arch}.attention.temperature_scale" + COMPRESS_RATIOS = "{arch}.attention.compress_ratios" + COMPRESS_ROPE_FREQ_BASE = "{arch}.attention.compress_rope_freq_base" + OUTPUT_LORA_RANK = "{arch}.attention.output_lora_rank" + OUTPUT_GROUP_COUNT = "{arch}.attention.output_group_count" class Indexer: HEAD_COUNT = "{arch}.attention.indexer.head_count" @@ -451,6 +459,7 @@ class MODEL_ARCH(IntEnum): DEEPSEEK = auto() DEEPSEEK2 = auto() DEEPSEEK2OCR = auto() + DEEPSEEK4 = auto() CHATGLM = auto() GLM4 = auto() GLM4_MOE = auto() @@ -527,6 +536,9 @@ class MODEL_TENSOR(IntEnum): TOKEN_TYPES = auto() POS_EMBD = auto() OUTPUT = auto() + OUTPUT_HC_BASE = auto() # deepseek4 hyper-connection output + OUTPUT_HC_FN = auto() # deepseek4 hyper-connection output + OUTPUT_HC_SCALE = auto() # deepseek4 hyper-connection output DENSE_2_OUT = auto() # embeddinggemma 2_Dense DENSE_3_OUT = auto() # embeddinggemma 3_Dense OUTPUT_NORM = auto() @@ -650,12 +662,19 @@ class MODEL_TENSOR(IntEnum): CHANNEL_MIX_VALUE = auto() ATTN_Q_A = auto() ATTN_Q_B = auto() + ATTN_KV = auto() # deepseek4 single-tensor combined KV projection ATTN_KV_A_MQA = auto() ATTN_KV_B = auto() ATTN_K_B = auto() ATTN_V_B = auto() + ATTN_OUT_A = auto() # deepseek4 attention output LoRA + ATTN_OUT_B = auto() # deepseek4 attention output LoRA ATTN_Q_A_NORM = auto() ATTN_KV_A_NORM = auto() + ATTN_COMPRESSOR_APE = auto() # deepseek4 attention compressor + ATTN_COMPRESSOR_KV = auto() # deepseek4 attention compressor + ATTN_COMPRESSOR_GATE = auto() # deepseek4 attention compressor + ATTN_COMPRESSOR_NORM = auto() # deepseek4 attention compressor FFN_SUB_NORM = auto() ATTN_SUB_NORM = auto() DEC_ATTN_NORM = auto() @@ -717,6 
+736,17 @@ class MODEL_TENSOR(IntEnum): INDEXER_PROJ = auto() INDEXER_ATTN_K = auto() INDEXER_ATTN_Q_B = auto() + INDEXER_COMPRESSOR_APE = auto() # deepseek4 indexer compressor + INDEXER_COMPRESSOR_KV = auto() # deepseek4 indexer compressor + INDEXER_COMPRESSOR_GATE = auto() # deepseek4 indexer compressor + INDEXER_COMPRESSOR_NORM = auto() # deepseek4 indexer compressor + HC_ATTN_BASE = auto() # deepseek4 hyper-connection attention + HC_ATTN_FN = auto() # deepseek4 hyper-connection attention + HC_ATTN_SCALE = auto() # deepseek4 hyper-connection attention + HC_FFN_BASE = auto() # deepseek4 hyper-connection ffn + HC_FFN_FN = auto() # deepseek4 hyper-connection ffn + HC_FFN_SCALE = auto() # deepseek4 hyper-connection ffn + FFN_GATE_TID2EID = auto() # deepseek4 token-id-to-expert-id gating # vision V_MMPROJ = auto() V_MMPROJ_FC = auto() @@ -966,6 +996,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.DEEPSEEK: "deepseek", MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr", + MODEL_ARCH.DEEPSEEK4: "deepseek4", MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.GLM4: "glm4", MODEL_ARCH.GLM4_MOE: "glm4moe", @@ -1042,6 +1073,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POS_EMBD: "position_embd", MODEL_TENSOR.OUTPUT_NORM: "output_norm", MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.OUTPUT_HC_BASE: "output_hc_base", + MODEL_TENSOR.OUTPUT_HC_FN: "output_hc_fn", + MODEL_TENSOR.OUTPUT_HC_SCALE: "output_hc_scale", MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense MODEL_TENSOR.ROPE_FREQS: "rope_freqs", @@ -1164,12 +1198,19 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value", MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a", MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", + MODEL_TENSOR.ATTN_KV: "blk.{bid}.attn_kv", MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b", 
MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b", + MODEL_TENSOR.ATTN_OUT_A: "blk.{bid}.attn_output_a", + MODEL_TENSOR.ATTN_OUT_B: "blk.{bid}.attn_output_b", MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", + MODEL_TENSOR.ATTN_COMPRESSOR_APE: "blk.{bid}.attn_compressor_ape", + MODEL_TENSOR.ATTN_COMPRESSOR_KV: "blk.{bid}.attn_compressor_kv", + MODEL_TENSOR.ATTN_COMPRESSOR_GATE: "blk.{bid}.attn_compressor_gate", + MODEL_TENSOR.ATTN_COMPRESSOR_NORM: "blk.{bid}.attn_compressor_norm", MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm", MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm", @@ -1231,6 +1272,17 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.INDEXER_PROJ: "blk.{bid}.indexer.proj", MODEL_TENSOR.INDEXER_ATTN_K: "blk.{bid}.indexer.attn_k", MODEL_TENSOR.INDEXER_ATTN_Q_B: "blk.{bid}.indexer.attn_q_b", + MODEL_TENSOR.INDEXER_COMPRESSOR_APE: "blk.{bid}.indexer_compressor_ape", + MODEL_TENSOR.INDEXER_COMPRESSOR_KV: "blk.{bid}.indexer_compressor_kv", + MODEL_TENSOR.INDEXER_COMPRESSOR_GATE: "blk.{bid}.indexer_compressor_gate", + MODEL_TENSOR.INDEXER_COMPRESSOR_NORM: "blk.{bid}.indexer_compressor_norm", + MODEL_TENSOR.HC_ATTN_BASE: "blk.{bid}.hc_attn_base", + MODEL_TENSOR.HC_ATTN_FN: "blk.{bid}.hc_attn_fn", + MODEL_TENSOR.HC_ATTN_SCALE: "blk.{bid}.hc_attn_scale", + MODEL_TENSOR.HC_FFN_BASE: "blk.{bid}.hc_ffn_base", + MODEL_TENSOR.HC_FFN_FN: "blk.{bid}.hc_ffn_fn", + MODEL_TENSOR.HC_FFN_SCALE: "blk.{bid}.hc_ffn_scale", + MODEL_TENSOR.FFN_GATE_TID2EID: "blk.{bid}.ffn_gate_tid2eid", # vision MODEL_TENSOR.V_MMPROJ: "mm.{bid}", MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", @@ -2928,6 +2980,49 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP_SHEXP, MODEL_TENSOR.FFN_EXP_PROBS_B, ], + MODEL_ARCH.DEEPSEEK4: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_HC_BASE, + MODEL_TENSOR.OUTPUT_HC_FN, + 
MODEL_TENSOR.OUTPUT_HC_SCALE, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_SINKS, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.ATTN_OUT_A, + MODEL_TENSOR.ATTN_OUT_B, + MODEL_TENSOR.ATTN_COMPRESSOR_APE, + MODEL_TENSOR.ATTN_COMPRESSOR_KV, + MODEL_TENSOR.ATTN_COMPRESSOR_GATE, + MODEL_TENSOR.ATTN_COMPRESSOR_NORM, + MODEL_TENSOR.INDEXER_PROJ, + MODEL_TENSOR.INDEXER_ATTN_Q_B, + MODEL_TENSOR.INDEXER_COMPRESSOR_APE, + MODEL_TENSOR.INDEXER_COMPRESSOR_KV, + MODEL_TENSOR.INDEXER_COMPRESSOR_GATE, + MODEL_TENSOR.INDEXER_COMPRESSOR_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, + MODEL_TENSOR.FFN_GATE_TID2EID, + MODEL_TENSOR.HC_ATTN_BASE, + MODEL_TENSOR.HC_ATTN_FN, + MODEL_TENSOR.HC_ATTN_SCALE, + MODEL_TENSOR.HC_FFN_BASE, + MODEL_TENSOR.HC_FFN_FN, + MODEL_TENSOR.HC_FFN_SCALE, + ], MODEL_ARCH.ERNIE4_5_MOE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -4147,6 +4242,8 @@ class GGMLQuantizationType(IntEnum): class ExpertGatingFuncType(IntEnum): SOFTMAX = 1 SIGMOID = 2 + SOFTMAX_WEIGHT = 3 + SQRTSOFTPLUS = 4 # TODO: add GGMLFileType from ggml_ftype in ggml.h diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index a101382719d0..4e3c94de12b9 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -868,6 +868,18 @@ def add_moe_latent_size(self, value: int) -> None: def add_nextn_predict_layers(self, count: int) -> None: self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count) + def add_hash_layer_count(self, count: int) -> None: + self.add_uint32(Keys.LLM.HASH_LAYER_COUNT.format(arch=self.arch), count) + + def add_hyper_connection_count(self, count: int) -> None: + 
self.add_uint32(Keys.LLM.HYPER_CONNECTION_COUNT.format(arch=self.arch), count) + + def add_hyper_connection_sinkhorn_iters(self, count: int) -> None: + self.add_uint32(Keys.LLM.HYPER_CONNECTION_SINKHORN_ITERS.format(arch=self.arch), count) + + def add_hyper_connection_eps(self, value: float) -> None: + self.add_float32(Keys.LLM.HYPER_CONNECTION_EPS.format(arch=self.arch), value) + def add_swin_norm(self, value: bool) -> None: self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value) @@ -952,6 +964,18 @@ def add_attn_temperature_length(self, value: int) -> None: def add_attn_temperature_scale(self, value: float) -> None: self.add_float32(Keys.Attention.TEMPERATURE_SCALE.format(arch=self.arch), value) + def add_attention_compress_ratios(self, values: Sequence[int]) -> None: + self.add_array(Keys.Attention.COMPRESS_RATIOS.format(arch=self.arch), values) + + def add_attention_compress_rope_freq_base(self, value: float) -> None: + self.add_float32(Keys.Attention.COMPRESS_ROPE_FREQ_BASE.format(arch=self.arch), value) + + def add_attention_output_lora_rank(self, value: int) -> None: + self.add_uint32(Keys.Attention.OUTPUT_LORA_RANK.format(arch=self.arch), value) + + def add_attention_output_group_count(self, value: int) -> None: + self.add_uint32(Keys.Attention.OUTPUT_GROUP_COUNT.format(arch=self.arch), value) + def add_pooling_type(self, value: PoolingType) -> None: self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) diff --git a/models/templates/deepseek-ai-DeepSeek-V4.jinja b/models/templates/deepseek-ai-DeepSeek-V4.jinja new file mode 100644 index 000000000000..44d5b785ec04 --- /dev/null +++ b/models/templates/deepseek-ai-DeepSeek-V4.jinja @@ -0,0 +1,96 @@ +{%- if not add_generation_prompt is defined -%} + {%- set add_generation_prompt = false -%} +{%- endif -%} +{%- if not thinking is defined -%} + {%- if enable_thinking is defined -%} + {%- set thinking = enable_thinking -%} + {%- else -%} + {%- set thinking = false -%} + {%- endif -%} 
+{%- endif -%} +{%- set dsml_token = '|DSML|' -%} +{%- set thinking_start_token = '' -%} +{%- set thinking_end_token = '' -%} +{%- set tools_header = '## Tools\n\nYou have access to a set of tools to help answer the user question. You can invoke tools by writing a "<' + dsml_token + 'tool_calls>" block like the following:\n\n<' + dsml_token + 'tool_calls>\n<' + dsml_token + 'invoke name="$TOOL_NAME">\n<' + dsml_token + 'parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE\n...\n\n<' + dsml_token + 'invoke name="$TOOL_NAME2">\n...\n\n\n\nString parameters should be specified as is and set `string="true"`. For all other types (numbers, booleans, arrays, objects), pass the value in JSON format and set `string="false"`.\n\nIf thinking_mode is enabled (triggered by ' + thinking_start_token + '), you MUST output your complete reasoning inside ' + thinking_start_token + '...' + thinking_end_token + ' BEFORE any tool calls or final response.\n\nOtherwise, output directly after ' + thinking_end_token + ' with tool calls or final response.\n\n### Available Tool Schemas\n\n' -%} +{%- set tools_footer = '\n\nYou MUST strictly follow the above defined tool name and parameter schemas to invoke tool calls.' 
-%} +{%- set ns = namespace(system_prompt='', is_first_system=true, pending_assistant=false, pending_tool_result=false) -%} +{%- for message in messages -%} + {%- if message['role'] == 'system' -%} + {%- if ns.is_first_system -%} + {%- set ns.system_prompt = ns.system_prompt + (message['content'] or '') -%} + {%- set ns.is_first_system = false -%} + {%- else -%} + {%- set ns.system_prompt = ns.system_prompt + '\n\n' + (message['content'] or '') -%} + {%- endif -%} + {%- endif -%} +{%- endfor -%} +{%- if tools is defined and tools -%} + {%- set ts = namespace(schemas='') -%} + {%- for tool in tools -%} + {%- if tool['type'] == 'function' -%} + {%- set ts.schemas = ts.schemas + (tool['function'] | tojson) + '\n' -%} + {%- endif -%} + {%- endfor -%} + {%- if ns.system_prompt -%} + {%- set ns.system_prompt = ns.system_prompt + '\n\n' + tools_header + ts.schemas + tools_footer -%} + {%- else -%} + {%- set ns.system_prompt = tools_header + ts.schemas + tools_footer -%} + {%- endif -%} +{%- endif -%} +{{- bos_token -}} +{{- ns.system_prompt -}} +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- '<|User|>' + (message['content'] or '') -}} + {%- set ns.pending_assistant = true -%} + {%- set ns.pending_tool_result = true -%} + {%- elif message['role'] == 'tool' -%} + {%- if not ns.pending_tool_result -%} + {{- '<|User|>' -}} + {%- endif -%} + {{- '' + (message['content'] or '') + '' -}} + {%- set ns.pending_assistant = true -%} + {%- set ns.pending_tool_result = true -%} + {%- elif message['role'] == 'assistant' -%} + {%- if ns.pending_assistant -%} + {{- '<|Assistant|>' -}} + {%- if thinking and message['reasoning_content'] is defined and message['reasoning_content'] -%} + {{- thinking_start_token + message['reasoning_content'] + thinking_end_token -}} + {%- else -%} + {{- thinking_end_token -}} + {%- endif -%} + {%- endif -%} + {{- (message['content'] or '') -}} + {%- if message['tool_calls'] -%} + {{- '\n\n<' + dsml_token + 'tool_calls>\n' -}} + 
{%- for tool in message['tool_calls'] -%} + {%- set func = tool['function'] -%} + {{- '<' + dsml_token + 'invoke name="' + func['name'] + '">\n' -}} + {%- set args = func['arguments'] -%} + {%- if args is string -%} + {%- set args = args | from_json -%} + {%- endif -%} + {%- for key, val in args.items() -%} + {%- if val is string -%} + {{- '<' + dsml_token + 'parameter name="' + key + '" string="true">' + val + '\n' -}} + {%- else -%} + {{- '<' + dsml_token + 'parameter name="' + key + '" string="false">' + (val | tojson) + '\n' -}} + {%- endif -%} + {%- endfor -%} + {{- '\n' -}} + {%- endfor -%} + {{- '' -}} + {%- endif -%} + {{- '<|end▁of▁sentence|>' -}} + {%- set ns.pending_assistant = false -%} + {%- set ns.pending_tool_result = false -%} + {%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt and ns.pending_assistant -%} + {{- '<|Assistant|>' -}} + {%- if thinking -%} + {{- thinking_start_token -}} + {%- else -%} + {{- thinking_end_token -}} + {%- endif -%} +{%- endif -%} diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index c9eead18aa39..e789e5a681ae 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -75,6 +75,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DEEPSEEK, "deepseek" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_DEEPSEEK2OCR, "deepseek2-ocr" }, + { LLM_ARCH_DEEPSEEK4, "deepseek4" }, { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_GLM4, "glm4" }, { LLM_ARCH_GLM4_MOE, "glm4moe" }, @@ -209,6 +210,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" }, { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" }, { LLM_KV_FULL_ATTENTION_INTERVAL, "%s.full_attention_interval" }, + { LLM_KV_HASH_LAYER_COUNT, "%s.hash_layer_count" }, + { LLM_KV_HYPER_CONNECTION_COUNT, "%s.hyper_connection.count" }, + { LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS, "%s.hyper_connection.sinkhorn_iterations" }, + { LLM_KV_HYPER_CONNECTION_EPS, "%s.hyper_connection.epsilon" }, { 
LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -243,6 +248,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" }, { LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" }, { LLM_KV_ATTENTION_SHARED_KV_LAYERS, "%s.attention.shared_kv_layers" }, + { LLM_KV_ATTENTION_COMPRESS_RATIOS, "%s.attention.compress_ratios" }, + { LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE, "%s.attention.compress_rope_freq_base" }, + { LLM_KV_ATTENTION_OUTPUT_LORA_RANK, "%s.attention.output_lora_rank" }, + { LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT, "%s.attention.output_group_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" }, @@ -346,6 +355,9 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT_NORM_LFM2, "token_embd_norm" }, // fix for wrong tensor name { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_OUTPUT_HC_BASE, "output_hc_base" }, + { LLM_TENSOR_OUTPUT_HC_FN, "output_hc_fn" }, + { LLM_TENSOR_OUTPUT_HC_SCALE, "output_hc_scale" }, { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, @@ -422,8 +434,15 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, + { LLM_TENSOR_ATTN_KV, "blk.%d.attn_kv" }, { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, + { LLM_TENSOR_ATTN_OUT_A, "blk.%d.attn_output_a" }, + { LLM_TENSOR_ATTN_OUT_B, "blk.%d.attn_output_b" }, + { LLM_TENSOR_ATTN_COMPRESSOR_APE, "blk.%d.attn_compressor_ape" }, + { LLM_TENSOR_ATTN_COMPRESSOR_KV, "blk.%d.attn_compressor_kv" }, + { LLM_TENSOR_ATTN_COMPRESSOR_GATE, "blk.%d.attn_compressor_gate" }, + 
{ LLM_TENSOR_ATTN_COMPRESSOR_NORM, "blk.%d.attn_compressor_norm" }, { LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" }, { LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" }, { LLM_TENSOR_PER_LAYER_PROJ_NORM, "per_layer_proj_norm" }, @@ -548,6 +567,17 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" }, { LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" }, { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" }, + { LLM_TENSOR_INDEXER_COMPRESSOR_APE, "blk.%d.indexer_compressor_ape" }, + { LLM_TENSOR_INDEXER_COMPRESSOR_KV, "blk.%d.indexer_compressor_kv" }, + { LLM_TENSOR_INDEXER_COMPRESSOR_GATE, "blk.%d.indexer_compressor_gate" }, + { LLM_TENSOR_INDEXER_COMPRESSOR_NORM, "blk.%d.indexer_compressor_norm" }, + { LLM_TENSOR_HC_ATTN_BASE, "blk.%d.hc_attn_base" }, + { LLM_TENSOR_HC_ATTN_FN, "blk.%d.hc_attn_fn" }, + { LLM_TENSOR_HC_ATTN_SCALE, "blk.%d.hc_attn_scale" }, + { LLM_TENSOR_HC_FFN_BASE, "blk.%d.hc_ffn_base" }, + { LLM_TENSOR_HC_FFN_FN, "blk.%d.hc_ffn_fn" }, + { LLM_TENSOR_HC_FFN_SCALE, "blk.%d.hc_ffn_scale" }, + { LLM_TENSOR_FFN_GATE_TID2EID, "blk.%d.ffn_gate_tid2eid" }, }; // declare information about the model weight tensors: @@ -566,6 +596,9 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_TOKEN_TYPES, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}}, {LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // do the norms on the first layer (not the input layer) {LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_OUTPUT_HC_BASE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, + {LLM_TENSOR_OUTPUT_HC_FN, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_OUTPUT_HC_SCALE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_SCALE}}, {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CLS_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, @@ -592,10 +625,15 @@ static const std::map 
LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_KV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_OUT_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_OUT_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_COMPRESSOR_KV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_COMPRESSOR_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}}, {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, @@ -757,6 +795,19 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_INDEXER_COMPRESSOR_KV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_INDEXER_COMPRESSOR_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_COMPRESSOR_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_INDEXER_COMPRESSOR_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_HC_ATTN_BASE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_HC_ATTN_FN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_HC_ATTN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}}, + {LLM_TENSOR_HC_FFN_BASE, 
{LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_HC_FFN_FN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_HC_FFN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}}, + {LLM_TENSOR_FFN_GATE_TID2EID, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}}, + {LLM_TENSOR_ATTN_COMPRESSOR_APE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_INDEXER_COMPRESSOR_APE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so // the model loader doesn't fault on the block index. @@ -902,6 +953,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) { case LLM_ARCH_OLMO2: case LLM_ARCH_OLMOE: case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_DEEPSEEK4: case LLM_ARCH_GLM_DSA: case LLM_ARCH_BITNET: case LLM_ARCH_T5: diff --git a/src/llama-arch.h b/src/llama-arch.h index 89cf16cc37cf..a1dcb037c7a2 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -79,6 +79,7 @@ enum llm_arch { LLM_ARCH_DEEPSEEK, LLM_ARCH_DEEPSEEK2, LLM_ARCH_DEEPSEEK2OCR, + LLM_ARCH_DEEPSEEK4, LLM_ARCH_CHATGLM, LLM_ARCH_GLM4, LLM_ARCH_GLM4_MOE, @@ -213,6 +214,10 @@ enum llm_kv { LLM_KV_TOKEN_SHIFT_COUNT, LLM_KV_INTERLEAVE_MOE_LAYER_STEP, LLM_KV_FULL_ATTENTION_INTERVAL, + LLM_KV_HASH_LAYER_COUNT, + LLM_KV_HYPER_CONNECTION_COUNT, + LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS, + LLM_KV_HYPER_CONNECTION_EPS, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -247,6 +252,10 @@ enum llm_kv { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, LLM_KV_ATTENTION_INDEXER_TOP_K, LLM_KV_ATTENTION_SHARED_KV_LAYERS, + LLM_KV_ATTENTION_COMPRESS_RATIOS, + LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE, + LLM_KV_ATTENTION_OUTPUT_LORA_RANK, + LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT_SWA, @@ -354,6 +363,9 @@ enum llm_tensor { LLM_TENSOR_DENSE_2_OUT, LLM_TENSOR_DENSE_3_OUT, LLM_TENSOR_OUTPUT, + LLM_TENSOR_OUTPUT_HC_BASE, + 
LLM_TENSOR_OUTPUT_HC_FN, + LLM_TENSOR_OUTPUT_HC_SCALE, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name LLM_TENSOR_ROPE_FREQS, @@ -482,12 +494,19 @@ enum llm_tensor { LLM_TENSOR_CHANNEL_MIX_VALUE, LLM_TENSOR_ATTN_Q_A, LLM_TENSOR_ATTN_Q_B, + LLM_TENSOR_ATTN_KV, LLM_TENSOR_ATTN_KV_A_MQA, LLM_TENSOR_ATTN_KV_B, LLM_TENSOR_ATTN_K_B, LLM_TENSOR_ATTN_V_B, + LLM_TENSOR_ATTN_OUT_A, + LLM_TENSOR_ATTN_OUT_B, LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_KV_A_NORM, + LLM_TENSOR_ATTN_COMPRESSOR_APE, + LLM_TENSOR_ATTN_COMPRESSOR_KV, + LLM_TENSOR_ATTN_COMPRESSOR_GATE, + LLM_TENSOR_ATTN_COMPRESSOR_NORM, LLM_TENSOR_ATTN_SUB_NORM, LLM_TENSOR_FFN_SUB_NORM, LLM_TENSOR_DEC_ATTN_NORM, @@ -549,6 +568,17 @@ enum llm_tensor { LLM_TENSOR_INDEXER_PROJ, LLM_TENSOR_INDEXER_ATTN_K, LLM_TENSOR_INDEXER_ATTN_Q_B, + LLM_TENSOR_INDEXER_COMPRESSOR_APE, + LLM_TENSOR_INDEXER_COMPRESSOR_KV, + LLM_TENSOR_INDEXER_COMPRESSOR_GATE, + LLM_TENSOR_INDEXER_COMPRESSOR_NORM, + LLM_TENSOR_HC_ATTN_BASE, + LLM_TENSOR_HC_ATTN_FN, + LLM_TENSOR_HC_ATTN_SCALE, + LLM_TENSOR_HC_FFN_BASE, + LLM_TENSOR_HC_FFN_FN, + LLM_TENSOR_HC_FFN_SCALE, + LLM_TENSOR_FFN_GATE_TID2EID, LLM_TENSOR_NEXTN_EH_PROJ, LLM_TENSOR_NEXTN_EMBED_TOKENS, LLM_TENSOR_NEXTN_ENORM, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d62abc4009b8..b1b6aa350735 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -420,7 +420,7 @@ void llama_context::sched_reserve() { const int64_t t_start_us = ggml_time_us(); - const uint32_t n_seqs = cparams.n_seq_max; + const uint32_t n_seqs = model.arch == LLM_ARCH_DEEPSEEK4 ? 1 : cparams.n_seq_max; const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); const size_t max_nodes = this->graph_max_nodes(n_tokens); @@ -596,6 +596,22 @@ void llama_context::sched_reserve() { n_nodes_pp = ggml_graph_n_nodes(gf); } + // DeepSeek V4 resumed-prompt chunks use the compressed-attention decode + // graph, which is larger than the position-zero prefill graph. 
+ if (model.arch == LLM_ARCH_DEEPSEEK4 && n_tokens > 1) { + const llama_pos reserve_pos0 = std::min( + cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : n_tokens, + std::max(cparams.n_batch, 8u*n_tokens)); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), + model.hparams.no_alloc, nullptr, reserve_pos0); + if (!gf) { + throw std::runtime_error("failed to allocate DeepSeek V4 resumed pp buffers"); + } + + n_splits_pp = std::max(n_splits_pp, ggml_backend_sched_get_n_splits(sched.get())); + n_nodes_pp = std::max(n_nodes_pp, ggml_graph_n_nodes(gf)); + } + // reserve with tg (token generation) graph to get the number of splits and nodes { auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc); @@ -2171,6 +2187,15 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) { return std::max(n_tokens * 40, 32u * model.n_tensors()); } + if (model.arch == LLM_ARCH_DEEPSEEK4) { + // DeepSeek V4 has a position-dependent compressed-attention decode path + // that creates many temporary tensor objects, especially when a long + // prompt is split into non-prefill ubatches. The visible graph node + // count is much smaller than the number of GGML objects allocated while + // building those graphs, so reserve a larger metadata arena than the + // generic tensor-count heuristic would provide. 
+ return std::max(524288u, n_tokens * 192 + 64u * model.n_tensors()); + } uint32_t res = std::max(1024u, 8u*model.n_tensors()); for (const auto & lora : model.loras) { res += lora->get_n_nodes(); @@ -2183,7 +2208,7 @@ llm_graph_result * llama_context::get_gf_res_reserve() const { } ggml_cgraph * llama_context::graph_reserve( - uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) { + uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes, llama_pos pos0) { LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs); GGML_ASSERT(n_outputs >= 1); @@ -2207,6 +2232,14 @@ ggml_cgraph * llama_context::graph_reserve( llama_batch_allocr balloc(model.hparams.n_pos_per_embd()); llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs); + if (pos0 != 0 && ubatch.pos != nullptr) { + for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { + ubatch.pos[i*ubatch.n_pos] = pos0 + i; + for (uint32_t j = 1; j < ubatch.n_pos; ++j) { + ubatch.pos[i*ubatch.n_pos + j] = 0; + } + } + } // set one output token per sequence in order to activate all backend samplers std::vector seq_ids(n_seqs); @@ -3357,6 +3390,29 @@ llama_context * llama_init_from_model( params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; } + // V4 (DeepSeek4) requires fp16 KV cache: V4's standard SWA K cache, + // compressed-attention K cache (cache.attn_k), and indexer K cache + // (cache.index_k) all share the same `type_k` and must agree in dtype + // because src/models/deepseek4.cpp concatenates the SWA K view with the + // compressed K view via ggml_concat (which asserts a->type == b->type). 
+ // Furthermore, V4's K activations are post-fp8-quantized + // (ggml_dsv4_fp8_kv_quantize), and q8_0's single fp16 scale per 32-element + // block cannot faithfully reproduce fp8-quantized value distributions -- + // pinning to q8_0 corrupts decode silently ("=" loops, "Mirror ..." + // garbage). Coerce here, before the SPLIT_MODE_TENSOR / FA / V-quant + // shared validations below and before the constructor's flash_attn check, + // so those validations see the effective fp16 types and won't reject V4 + // requests with --cache-type-k|v q8_0. See + // docs/plans/v4-port-kv-q8-completion.md. + if (model->arch == LLM_ARCH_DEEPSEEK4) { + if (params.type_k != GGML_TYPE_F16 || params.type_v != GGML_TYPE_F16) { + LLAMA_LOG_WARN("DeepSeek4: forcing fp16 KV cache (--cache-type-k|v are ignored for V4 because compressed/indexer K caches require fp16; " + "see docs/plans/v4-port-kv-q8-completion.md)\n"); + params.type_k = GGML_TYPE_F16; + params.type_v = GGML_TYPE_F16; + } + } + if (model->split_mode() == LLAMA_SPLIT_MODE_TENSOR) { if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) { LLAMA_LOG_INFO("%s: enabling flash_attn since it is required for SPLIT_MODE_TENSOR\n", __func__); diff --git a/src/llama-context.h b/src/llama-context.h index e16ac4c618ba..999ba5a800c5 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -240,7 +240,8 @@ struct llama_context { // reserve a graph with a dummy ubatch of the specified size ggml_cgraph * graph_reserve( - uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr); + uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr, + llama_pos pos0 = 0); bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 858c297dd762..bbb74a0661b4 100644 --- a/src/llama-graph.cpp +++ 
b/src/llama-graph.cpp @@ -500,29 +500,41 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) { } void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { - mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); - mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); + if (self_k_idxs && self_k_idxs->buffer) { + mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); + } + if (self_v_idxs && self_v_idxs->buffer) { + mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); + } - mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + if (self_kq_mask && self_kq_mask->buffer) { + mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + } - mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); - mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); + if (self_k_idxs_swa && self_k_idxs_swa->buffer) { + mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); + } + if (self_v_idxs_swa && self_v_idxs_swa->buffer) { + mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); + } - mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + if (self_kq_mask_swa && self_kq_mask_swa->buffer) { + mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + } - if (self_k_rot) { + if (self_k_rot && self_k_rot->buffer) { mctx->get_base()->set_input_k_rot(self_k_rot); } - if (self_v_rot) { + if (self_v_rot && self_v_rot->buffer) { mctx->get_base()->set_input_v_rot(self_v_rot); } - if (self_k_rot_swa) { + if (self_k_rot_swa && self_k_rot_swa->buffer) { mctx->get_swa()->set_input_k_rot(self_k_rot_swa); } - if (self_v_rot_swa) { + if (self_v_rot_swa && self_v_rot_swa->buffer) { mctx->get_swa()->set_input_v_rot(self_v_rot_swa); } } @@ -534,14 +546,19 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { bool res = true; - res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; - //res &= 
self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + if (self_k_idxs && self_k_idxs->buffer) { + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + + res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + } - res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; - //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + if (self_k_idxs_swa && self_k_idxs_swa->buffer) { + res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); - res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + } return res; } @@ -591,7 +608,7 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) { const int64_t n_rs = mctx->get_recr()->get_n_rs(); - if (inp_rs->s_copy) { + if (inp_rs->s_copy && inp_rs->s_copy->buffer) { GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer)); int32_t * data = (int32_t *) inp_rs->s_copy->data; @@ -614,10 +631,12 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) { res &= can_reuse_kq_mask(inp_attn->self_kq_mask, mctx->get_attn(), params.ubatch, params.cparams); - res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); + if (inp_rs->s_copy && inp_rs->s_copy->buffer) { + res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); - res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; - res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - 
params.ubatch.n_seqs; + res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; + res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs; + } res &= inp_rs->head == mctx->get_recr()->get_head(); res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z(); @@ -635,7 +654,7 @@ void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) { const int64_t n_rs = mctx->get_recr()->get_n_rs(); - if (inp_rs->s_copy) { + if (inp_rs->s_copy && inp_rs->s_copy->buffer) { GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer)); int32_t * data = (int32_t *) inp_rs->s_copy->data; @@ -657,10 +676,12 @@ bool llm_graph_input_mem_hybrid_k::can_reuse(const llm_graph_params & params) { res &= can_reuse_kq_mask(inp_attn->self_kq_mask, mctx->get_attn(), params.ubatch, params.cparams); - res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); + if (inp_rs->s_copy && inp_rs->s_copy->buffer) { + res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); - res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; - res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs; + res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; + res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs; + } res &= inp_rs->head == mctx->get_recr()->get_head(); res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z(); @@ -674,38 +695,46 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { // base tensors may not be allocated if there are no non-SWA attention layers if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) { attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch); - attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch); + if (inp_attn->self_v_idxs && inp_attn->self_v_idxs->buffer) { + attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch); + } - 
attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); + if (inp_attn->self_kq_mask && inp_attn->self_kq_mask->buffer) { + attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); + } } // swa tensors may not be allocated if there are no SWA attention layers if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) { attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch); - attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch); + if (inp_attn->self_v_idxs_swa && inp_attn->self_v_idxs_swa->buffer) { + attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch); + } - attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn); + if (inp_attn->self_kq_mask_swa && inp_attn->self_kq_mask_swa->buffer) { + attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn); + } } - if (inp_attn->self_k_rot) { + if (inp_attn->self_k_rot && inp_attn->self_k_rot->buffer) { attn_ctx->get_base()->set_input_k_rot(inp_attn->self_k_rot); } - if (inp_attn->self_v_rot) { + if (inp_attn->self_v_rot && inp_attn->self_v_rot->buffer) { attn_ctx->get_base()->set_input_v_rot(inp_attn->self_v_rot); } - if (inp_attn->self_k_rot_swa) { + if (inp_attn->self_k_rot_swa && inp_attn->self_k_rot_swa->buffer) { attn_ctx->get_swa()->set_input_k_rot(inp_attn->self_k_rot_swa); } - if (inp_attn->self_v_rot_swa) { + if (inp_attn->self_v_rot_swa && inp_attn->self_v_rot_swa->buffer) { attn_ctx->get_swa()->set_input_v_rot(inp_attn->self_v_rot_swa); } const int64_t n_rs = mctx->get_recr()->get_n_rs(); - if (inp_rs->s_copy) { + if (inp_rs->s_copy && inp_rs->s_copy->buffer) { GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer)); int32_t * data = (int32_t *) inp_rs->s_copy->data; @@ -741,10 +770,12 @@ bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params) res &= 
can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams); } - res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); + if (inp_rs->s_copy && inp_rs->s_copy->buffer) { + res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); - res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; - res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs; + res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; + res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs; + } res &= inp_rs->head == mctx->get_recr()->get_head(); res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z(); @@ -1325,7 +1356,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * gate_up_exps, ggml_tensor * up_exps_s, ggml_tensor * gate_exps_s, - ggml_tensor * down_exps_s) const { + ggml_tensor * down_exps_s, + ggml_tensor * selected_experts_in) const { return build_moe_ffn( cur, gate_inp, /* gate_inp_b */ nullptr, @@ -1345,7 +1377,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn( /* gate_up_exps_b */ nullptr, up_exps_s, gate_exps_s, - down_exps_s + down_exps_s, + selected_experts_in ); } @@ -1372,10 +1405,12 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * gate_up_exps_b, ggml_tensor * up_exps_s, ggml_tensor * gate_exps_s, - ggml_tensor * down_exps_s) const { + ggml_tensor * down_exps_s, + ggml_tensor * selected_experts_in) const { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN + const bool weight_before_down = arch == LLM_ARCH_DEEPSEEK4; // DeepSeek V4 applies routed weights after SwiGLU and before w2 ggml_tensor * logits = nullptr; @@ -1401,6 +1436,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn( { probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens] } break; + case 
LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS: + { + probs = ggml_sqrt(ctx0, ggml_softplus(ctx0, logits)); // [n_expert, n_tokens] + } break; case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT: { probs = logits; // [n_expert, n_tokens] @@ -1455,8 +1494,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn( } // select experts - ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] - cb(selected_experts->src[0], "ffn_moe_argsort", il); + ggml_tensor * selected_experts = selected_experts_in; + if (selected_experts == nullptr) { + selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + } cb(selected_experts, "ffn_moe_topk", il); if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) { @@ -1584,6 +1626,25 @@ ggml_tensor * llm_graph_context::build_moe_ffn( switch (type_op) { case LLM_FFN_SILU: if (gate_exps) { + if (arch == LLM_ARCH_DEEPSEEK4 && il >= 0) { + const float limit = hparams.swiglu_clamp_exp[il]; + constexpr float eps = 1e-6f; + if (limit > eps) { + cur = ggml_clamp(ctx0, cur, -INFINITY, limit); + cb(cur, "ffn_moe_gate_clamped", il); + + ggml_tensor * gate_act = ggml_silu(ctx0, cur); + cb(gate_act, "ffn_moe_silu", il); + + up = ggml_clamp(ctx0, up, -limit, limit); + cb(up, "ffn_moe_up_clamped", il); + + cur = ggml_mul(ctx0, gate_act, up); + cb(cur, "ffn_moe_swiglu_limited", il); + break; + } + } + // Step35: per-layer clamp for routed experts if (arch == LLM_ARCH_STEP35 && il >= 0) { const float limit = hparams.swiglu_clamp_exp[il]; @@ -1648,6 +1709,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn( GGML_ABORT("fatal error"); } + if (weight_before_down) { + cur = ggml_mul(ctx0, cur, weights); + cb(cur, "ffn_moe_weighted_swiglu", il); + } + experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); @@ -1665,7 
+1731,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(experts, "ffn_moe_down_scaled", il); } - if (!weight_before_ffn) { + if (!weight_before_ffn && !weight_before_down) { experts = ggml_mul(ctx0, experts, weights); cb(experts, "ffn_moe_weighted", il); } diff --git a/src/llama-graph.h b/src/llama-graph.h index 9e55d0a675e0..260334f7302f 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -849,7 +849,8 @@ struct llm_graph_context { ggml_tensor * gate_up_exps = nullptr, ggml_tensor * up_exps_s = nullptr, ggml_tensor * gate_exps_s = nullptr, - ggml_tensor * down_exps_s = nullptr) const; + ggml_tensor * down_exps_s = nullptr, + ggml_tensor * selected_experts_in = nullptr) const; ggml_tensor * build_moe_ffn( ggml_tensor * cur, @@ -874,7 +875,8 @@ struct llm_graph_context { ggml_tensor * gate_up_exps_b = nullptr, ggml_tensor * up_exps_s = nullptr, ggml_tensor * gate_exps_s = nullptr, - ggml_tensor * down_exps_s = nullptr) const; + ggml_tensor * down_exps_s = nullptr, + ggml_tensor * selected_experts_in = nullptr) const; // // inputs diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 2239309c8fb4..44eaf501f7dc 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -153,6 +153,10 @@ uint32_t llama_hparams::n_embd_v_gqa_max() const { } uint32_t llama_hparams::n_embd_r() const { + if (dsv4_state_size != 0) { + return dsv4_state_size; + } + if (wkv_head_size != 0) { // for RWKV models return token_shift_count * n_embd; @@ -177,6 +181,10 @@ uint32_t llama_hparams::n_embd_r() const { } uint32_t llama_hparams::n_embd_s() const { + if (dsv4_state_size != 0) { + return dsv4_state_size; + } + if (wkv_head_size != 0) { // corresponds to RWKV's wkv_states size return n_embd * wkv_head_size; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index e2d051edc6cd..3a0438283e77 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -14,6 +14,7 @@ enum llama_expert_gating_func_type { LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1, 
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits + LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS = 4, }; enum llama_swa_type { @@ -75,6 +76,8 @@ struct llama_hparams { uint32_t n_layer_dense_lead = 0; uint32_t n_lora_q = 0; uint32_t n_lora_kv = 0; + uint32_t n_lora_o = 0; + uint32_t n_attn_out_groups = 0; uint32_t n_ff_exp = 0; uint32_t n_ff_shexp = 0; uint32_t n_ff_chexp = 0; @@ -91,6 +94,7 @@ struct llama_hparams { uint32_t moe_every_n_layers = 0; uint32_t moe_latent_size = 0; uint32_t nextn_predict_layers = 0; + uint32_t n_hash_layers = 0; bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches) @@ -211,6 +215,14 @@ struct llama_hparams { uint32_t indexer_head_size = 0; uint32_t indexer_top_k = 0; + // DeepSeek V4 hyper-connections and sparse KV compression + uint32_t n_hc = 1; + uint32_t hc_sinkhorn_iters = 0; + float hc_eps = 0.0f; + float compress_rope_freq_base = 0.0f; + uint32_t dsv4_state_size = 0; + std::array attn_compress_ratio; + // qwen3vl deepstack uint32_t n_deepstack_layers = 0; diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp index 26e2cb4270b0..9b9f17903637 100644 --- a/src/llama-kv-cache-iswa.cpp +++ b/src/llama-kv-cache-iswa.cpp @@ -60,14 +60,14 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base); kv_base = std::make_unique( - model, type_k, type_v, + model, hparams, type_k, type_v, v_trans, offload, unified, size_base, n_seq_max, n_pad, 0, LLAMA_SWA_TYPE_NONE, filter_base, reuse); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( - model, type_k, type_v, + model, hparams, type_k, type_v, v_trans, offload, unified, size_swa, n_seq_max, n_pad, hparams.n_swa, hparams.swa_type, filter_swa, reuse); } diff --git a/src/llama-kv-cache.cpp 
b/src/llama-kv-cache.cpp index a49a055a6304..92585b671b55 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -79,6 +79,7 @@ static ggml_tensor * ggml_mul_mat_aux( llama_kv_cache::llama_kv_cache( const llama_model & model, + const llama_hparams & hparams, ggml_type type_k, ggml_type type_v, bool v_trans, @@ -91,7 +92,7 @@ llama_kv_cache::llama_kv_cache( llama_swa_type swa_type, const layer_filter_cb & filter, const layer_reuse_cb & reuse) : - model(model), hparams(model.hparams), v_trans(v_trans), + model(model), hparams(hparams), v_trans(v_trans), n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { GGML_ASSERT(kv_size % n_pad == 0); @@ -205,7 +206,7 @@ llama_kv_cache::llama_kv_cache( } const bool has_k = true; - const bool has_v = !is_mla; + const bool has_v = !is_mla && model.arch != LLM_ARCH_DEEPSEEK4; ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr; ggml_tensor * v = has_v ? 
ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr; @@ -253,7 +254,7 @@ llama_kv_cache::llama_kv_cache( // allocate tensors and initialize the buffers to avoid NaNs in the padding for (auto & [buft, ctx] : ctx_map) { ggml_backend_buffer_t buf; - if (model.hparams.no_alloc) { + if (hparams.no_alloc) { buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) { t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 0b62dc7b2320..0b0a56ce92f4 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -95,6 +95,7 @@ class llama_kv_cache : public llama_memory_i { llama_kv_cache( const llama_model & model, + const llama_hparams & hparams, ggml_type type_k, ggml_type type_v, bool v_trans, diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp index a59561ea54dd..58dadabc9f62 100644 --- a/src/llama-memory-hybrid-iswa.cpp +++ b/src/llama-memory-hybrid-iswa.cpp @@ -1,9 +1,113 @@ #include "llama-memory-hybrid-iswa.h" +#include "ggml-backend.h" + #include "llama-impl.h" +#include "llama-io.h" #include "llama-model.h" #include "llama-context.h" +#include +#include +#include +#include +#include +#include + +namespace { + +constexpr uint32_t DSV4_COMPRESSED_KV_STATE_MAGIC = 0x44535634; // "DSV4" +constexpr uint32_t DSV4_COMPRESSED_KV_STATE_VERSION = 1; +constexpr uint32_t DSV4_COMPRESSED_DECODE_UBATCH_MAX = 512; + +struct dsv4_row_range { + uint32_t begin = 0; + uint32_t end = 0; + + uint32_t size() const { + GGML_ASSERT(end >= begin); + return end - begin; + } +}; + +static dsv4_row_range dsv4_make_row_range(uint32_t n_comp, uint32_t ratio, llama_pos p0, llama_pos p1) { + GGML_ASSERT(ratio > 0); + + if (n_comp == 0) { + return {}; + } + + if (p0 < 0) { + p0 = 0; + } + if (p1 < 0) { + p1 = 
std::numeric_limits::max(); + } + if (p0 >= p1) { + return {}; + } + + const uint64_t row_begin = (uint64_t) p0 / ratio; + uint64_t row_end; + if (p1 == std::numeric_limits::max()) { + row_end = n_comp; + } else { + row_end = ((uint64_t) p1 + ratio - 1) / ratio; + } + + return { + (uint32_t) std::min(row_begin, n_comp), + (uint32_t) std::min(row_end, n_comp), + }; +} + +static size_t dsv4_cache_row_size(const ggml_tensor * t) { + GGML_ASSERT(t != nullptr); + + const size_t row_size = ggml_row_size(t->type, t->ne[0]); + GGML_ASSERT((size_t) t->nb[1] == row_size); + GGML_ASSERT((size_t) t->nb[2] == row_size*(size_t) t->ne[1]); + + return row_size; +} + +static size_t dsv4_cache_offset(const ggml_tensor * t, llama_seq_id seq_id, uint32_t row) { + GGML_ASSERT(seq_id >= 0); + GGML_ASSERT(row <= (uint32_t) t->ne[1]); + + return (size_t) seq_id*(size_t) t->nb[2] + (size_t) row*(size_t) t->nb[1]; +} + +static void dsv4_zero_cache_rows(ggml_tensor * t, llama_seq_id seq_id, uint32_t row_start, uint32_t n_rows) { + if (t == nullptr || n_rows == 0) { + return; + } + + const size_t row_size = dsv4_cache_row_size(t); + const size_t n_bytes = (size_t) n_rows*row_size; + const size_t offset = dsv4_cache_offset(t, seq_id, row_start); + + std::vector zeros(n_bytes, 0); + ggml_backend_tensor_set(t, zeros.data(), offset, n_bytes); +} + +static void dsv4_copy_cache_rows(ggml_tensor * t, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, uint32_t row_start, uint32_t n_rows) { + if (t == nullptr || n_rows == 0 || seq_id_src == seq_id_dst) { + return; + } + + const size_t row_size = dsv4_cache_row_size(t); + const size_t n_bytes = (size_t) n_rows*row_size; + const size_t src_offset = dsv4_cache_offset(t, seq_id_src, row_start); + const size_t dst_offset = dsv4_cache_offset(t, seq_id_dst, row_start); + + std::vector tmp(n_bytes); + ggml_backend_tensor_get(t, tmp.data(), src_offset, n_bytes); + ggml_backend_tensor_set(t, tmp.data(), dst_offset, n_bytes); +} + +} // namespace + // // 
llama_memory_hybrid_iswa // @@ -59,9 +163,103 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa( filter_recr == nullptr ? [&](int32_t il) { return hparams.is_recurrent(il); } : filter_recr - )) {} + )) { + if (model.arch != LLM_ARCH_DEEPSEEK4) { + return; + } + + dsv4_n_seq_max = n_seq_max; + dsv4_cache_layers.resize(hparams.n_layer); + + struct ggml_backend_buft_comparator { + bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const { + return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0; + } + }; + std::map ctx_map; + + auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + ggml_init_params params = { + /*.mem_size =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + + ggml_context * ctx = ggml_init(params); + if (!ctx) { + return nullptr; + } + + ctx_map.emplace(buft, ctx); + return ctx; + } + + return it->second.get(); + }; + + for (uint32_t il = 0; il < hparams.n_layer; ++il) { + const uint32_t ratio = hparams.attn_compress_ratio[il]; + if (ratio == 0) { + continue; + } + + const uint32_t n_comp = std::max(1, (kv_size + ratio - 1) / ratio); + + const char * dev_name = "CPU"; + ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type(); + + if (offload) { + auto * dev = model.dev_layer(il); + buft = ggml_backend_dev_buffer_type(dev); + dev_name = ggml_backend_dev_name(dev); + } + + LLAMA_LOG_DEBUG("%s: DeepSeek4 compressed KV layer %3d: dev = %s, ratio = %u, rows = %u\n", + __func__, il, dev_name, ratio, n_comp); + + ggml_context * ctx = ctx_for_buft(buft); + if (!ctx) { + throw std::runtime_error("failed to create ggml context for DeepSeek4 compressed KV cache"); + } + + auto & cache = dsv4_cache_layers[il]; + cache.n_comp = n_comp; + cache.attn_k = ggml_new_tensor_3d(ctx, type_k, hparams.n_embd_head_k(il), n_comp, dsv4_n_seq_max); + 
ggml_format_name(cache.attn_k, "cache_dsv4_attn_k_l%d", il); + + if (ratio == 4) { + cache.index_k = ggml_new_tensor_3d(ctx, type_k, hparams.indexer_head_size, n_comp, dsv4_n_seq_max); + ggml_format_name(cache.index_k, "cache_dsv4_index_k_l%d", il); + } + } + + for (auto & [buft, ctx] : ctx_map) { + ggml_backend_buffer_t buf; + if (model.hparams.no_alloc) { + buf = ggml_backend_buft_alloc_buffer(buft, 0); + for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) { + t->buffer = buf; + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); + } + if (!buf) { + throw std::runtime_error("failed to allocate DeepSeek4 compressed KV cache buffer"); + } + + LLAMA_LOG_INFO("%s: %10s DeepSeek4 compressed KV buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + + ggml_backend_buffer_clear(buf, 0); + dsv4_ctxs_bufs.emplace_back(std::move(ctx), buf); + } +} llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { + const bool dsv4_compressed = has_dsv4_compressed_kv(); + do { balloc.split_reset(); @@ -71,7 +269,23 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr while (true) { llama_ubatch ubatch; - if (embd_all) { + if (dsv4_compressed) { + // DeepSeek V4 compressed attention keeps sequence-local compressor + // state and compressed cache rows. Process one sequence set per + // ubatch while still allowing multi-sequence batches at the API + // level. 
+ uint32_t n_ubatch_dsv4 = n_ubatch; + const auto & batch = balloc.get_batch(); + const bool first_split = balloc.get_n_used() == 0; + const bool starts_at_zero = batch.pos == nullptr || batch.pos[0] == 0; + if (!first_split || !starts_at_zero) { + // Non-prefill compressed-attention chunks build one + // compressor update per token and can otherwise exhaust the + // graph metadata arena on long contexts. + n_ubatch_dsv4 = std::min(n_ubatch_dsv4, DSV4_COMPRESSED_DECODE_UBATCH_MAX); + } + ubatch = balloc.split_seq(n_ubatch_dsv4); + } else if (embd_all) { // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { @@ -128,6 +342,10 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_update(llama_context * l } bool llama_memory_hybrid_iswa::get_can_shift() const { + if (has_dsv4_compressed_kv()) { + return false; + } + // Shifting is trivially supported for recurrent return mem_attn->get_can_shift(); } @@ -135,6 +353,12 @@ bool llama_memory_hybrid_iswa::get_can_shift() const { void llama_memory_hybrid_iswa::clear(bool data) { mem_attn->clear(data); mem_recr->clear(data); + + if (data) { + for (auto & [_, buf] : dsv4_ctxs_bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } + } } bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { @@ -143,25 +367,39 @@ bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_p if (!mem_recr->seq_rm(seq_id, p0, p1)) { return false; } - return mem_attn->seq_rm(seq_id, p0, p1); + if (!mem_attn->seq_rm(seq_id, p0, p1)) { + return false; + } + dsv4_seq_rm(seq_id, p0, p1); + return true; } void llama_memory_hybrid_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1); mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1); + dsv4_seq_cp(seq_id_src, seq_id_dst, p0, p1); } void llama_memory_hybrid_iswa::seq_keep(llama_seq_id seq_id) { mem_attn->seq_keep(seq_id); 
mem_recr->seq_keep(seq_id); + dsv4_seq_keep(seq_id); } void llama_memory_hybrid_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + if (has_dsv4_compressed_kv() && shift != 0) { + GGML_ABORT("DeepSeek V4 compressed KV cache does not support K-shift"); + } + mem_attn->seq_add(seq_id, p0, p1, shift); mem_recr->seq_add(seq_id, p0, p1, shift); } void llama_memory_hybrid_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (has_dsv4_compressed_kv() && d != 1) { + GGML_ABORT("DeepSeek V4 compressed KV cache does not support position division"); + } + mem_attn->seq_div(seq_id, p0, p1, d); mem_recr->seq_div(seq_id, p0, p1, d); } @@ -181,17 +419,383 @@ std::map llama_memory_hybrid_iswa::memory_br for (const auto & buft_size : mem_recr->memory_breakdown()) { mb[buft_size.first] += buft_size.second; } + for (const auto & [_, buf] : dsv4_ctxs_bufs) { + mb[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get()); + } return mb; } void llama_memory_hybrid_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { mem_attn->state_write(io, seq_id, flags); mem_recr->state_write(io, seq_id, flags); + dsv4_state_write(io, seq_id); } void llama_memory_hybrid_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { mem_attn->state_read(io, seq_id, flags); mem_recr->state_read(io, seq_id, flags); + dsv4_state_read(io, seq_id); +} + +void llama_memory_hybrid_iswa::dsv4_seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + if (!has_dsv4_compressed_kv()) { + return; + } + + if (seq_id >= 0) { + GGML_ASSERT((uint32_t) seq_id < dsv4_n_seq_max); + for (int32_t il = 0; il < (int32_t) dsv4_cache_layers.size(); ++il) { + dsv4_clear_rows(seq_id, il, p0, p1); + } + return; + } + + for (uint32_t seq = 0; seq < dsv4_n_seq_max; ++seq) { + for (int32_t il = 0; il < (int32_t) dsv4_cache_layers.size(); ++il) { + dsv4_clear_rows(seq, il, p0, 
p1); + } + } +} + +void llama_memory_hybrid_iswa::dsv4_seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (!has_dsv4_compressed_kv() || seq_id_src == seq_id_dst) { + return; + } + + GGML_ASSERT(seq_id_src >= 0 && (uint32_t) seq_id_src < dsv4_n_seq_max); + GGML_ASSERT(seq_id_dst >= 0 && (uint32_t) seq_id_dst < dsv4_n_seq_max); + + for (int32_t il = 0; il < (int32_t) dsv4_cache_layers.size(); ++il) { + dsv4_copy_rows(seq_id_src, seq_id_dst, il, p0, p1); + } +} + +void llama_memory_hybrid_iswa::dsv4_seq_keep(llama_seq_id seq_id) { + if (!has_dsv4_compressed_kv()) { + return; + } + + GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max); + + for (uint32_t seq = 0; seq < dsv4_n_seq_max; ++seq) { + if ((llama_seq_id) seq == seq_id) { + continue; + } + + dsv4_clear_seq(seq); + } +} + +void llama_memory_hybrid_iswa::dsv4_clear_seq(llama_seq_id seq_id) { + GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max); + + for (const auto & layer : dsv4_cache_layers) { + dsv4_zero_cache_rows(layer.attn_k, seq_id, 0, layer.n_comp); + dsv4_zero_cache_rows(layer.index_k, seq_id, 0, layer.n_comp); + } +} + +void llama_memory_hybrid_iswa::dsv4_clear_rows(llama_seq_id seq_id, int32_t il, llama_pos p0, llama_pos p1) { + GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max); + GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size()); + + const uint32_t ratio = hparams.attn_compress_ratio[il]; + if (ratio == 0) { + return; + } + + const auto & layer = dsv4_cache_layers[il]; + const auto range = dsv4_make_row_range(layer.n_comp, ratio, p0, p1); + + dsv4_zero_cache_rows(layer.attn_k, seq_id, range.begin, range.size()); + dsv4_zero_cache_rows(layer.index_k, seq_id, range.begin, range.size()); +} + +void llama_memory_hybrid_iswa::dsv4_copy_rows(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, int32_t il, llama_pos p0, llama_pos p1) { + GGML_ASSERT(seq_id_src >= 0 && (uint32_t) seq_id_src < dsv4_n_seq_max); + 
GGML_ASSERT(seq_id_dst >= 0 && (uint32_t) seq_id_dst < dsv4_n_seq_max); + GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size()); + + const uint32_t ratio = hparams.attn_compress_ratio[il]; + if (ratio == 0) { + return; + } + + const auto & layer = dsv4_cache_layers[il]; + const auto range = dsv4_make_row_range(layer.n_comp, ratio, p0, p1); + + dsv4_copy_cache_rows(layer.attn_k, seq_id_src, seq_id_dst, range.begin, range.size()); + dsv4_copy_cache_rows(layer.index_k, seq_id_src, seq_id_dst, range.begin, range.size()); +} + +uint32_t llama_memory_hybrid_iswa::dsv4_n_state_rows(int32_t il, llama_seq_id seq_id) const { + GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max); + GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size()); + + const uint32_t ratio = hparams.attn_compress_ratio[il]; + if (ratio == 0) { + return 0; + } + + const llama_pos pos_max = mem_attn->seq_pos_max(seq_id); + if (pos_max < 0) { + return 0; + } + + const uint64_t n_rows = ((uint64_t) pos_max + 1) / ratio; + return (uint32_t) std::min(n_rows, dsv4_cache_layers[il].n_comp); +} + +void llama_memory_hybrid_iswa::dsv4_state_write(llama_io_write_i & io, llama_seq_id seq_id) const { + if (!has_dsv4_compressed_kv()) { + return; + } + + GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max)); + + std::vector seq_ids; + auto seq_has_rows = [&](llama_seq_id seq) { + for (int32_t il = 0; il < (int32_t) dsv4_cache_layers.size(); ++il) { + if (dsv4_n_state_rows(il, seq) > 0) { + return true; + } + } + return false; + }; + + if (seq_id >= 0) { + if (seq_has_rows(seq_id)) { + seq_ids.push_back(seq_id); + } + } else { + for (uint32_t seq = 0; seq < dsv4_n_seq_max; ++seq) { + if (seq_has_rows(seq)) { + seq_ids.push_back(seq); + } + } + } + + const uint32_t magic = DSV4_COMPRESSED_KV_STATE_MAGIC; + const uint32_t version = DSV4_COMPRESSED_KV_STATE_VERSION; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_seq = seq_ids.size(); + + 
io.write(&magic, sizeof(magic)); + io.write(&version, sizeof(version)); + io.write(&n_layer, sizeof(n_layer)); + io.write(&n_seq, sizeof(n_seq)); + + for (uint32_t il = 0; il < n_layer; ++il) { + const auto & layer = dsv4_cache_layers[il]; + + const uint32_t n_comp = layer.n_comp; + io.write(&n_comp, sizeof(n_comp)); + + const uint32_t has_attn = layer.attn_k != nullptr; + io.write(&has_attn, sizeof(has_attn)); + if (has_attn) { + const int32_t type_i = (int32_t) layer.attn_k->type; + const uint64_t row_size = dsv4_cache_row_size(layer.attn_k); + io.write(&type_i, sizeof(type_i)); + io.write(&row_size, sizeof(row_size)); + } + + const uint32_t has_index = layer.index_k != nullptr; + io.write(&has_index, sizeof(has_index)); + if (has_index) { + const int32_t type_i = (int32_t) layer.index_k->type; + const uint64_t row_size = dsv4_cache_row_size(layer.index_k); + io.write(&type_i, sizeof(type_i)); + io.write(&row_size, sizeof(row_size)); + } + } + + for (llama_seq_id seq : seq_ids) { + io.write(&seq, sizeof(seq)); + + for (uint32_t il = 0; il < n_layer; ++il) { + const auto & layer = dsv4_cache_layers[il]; + const uint32_t n_rows = dsv4_n_state_rows(il, seq); + + if (layer.attn_k != nullptr) { + const uint64_t row_size = dsv4_cache_row_size(layer.attn_k); + io.write(&n_rows, sizeof(n_rows)); + if (n_rows > 0) { + io.write_tensor(layer.attn_k, dsv4_cache_offset(layer.attn_k, seq, 0), (size_t) n_rows*row_size); + } + } + + if (layer.index_k != nullptr) { + const uint64_t row_size = dsv4_cache_row_size(layer.index_k); + io.write(&n_rows, sizeof(n_rows)); + if (n_rows > 0) { + io.write_tensor(layer.index_k, dsv4_cache_offset(layer.index_k, seq, 0), (size_t) n_rows*row_size); + } + } + } + } +} + +void llama_memory_hybrid_iswa::dsv4_state_read(llama_io_read_i & io, llama_seq_id seq_id) { + if (!has_dsv4_compressed_kv()) { + return; + } + + GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max)); + + uint32_t magic; + uint32_t version; + uint32_t 
n_layer; + uint32_t n_seq; + + io.read(&magic, sizeof(magic)); + io.read(&version, sizeof(version)); + io.read(&n_layer, sizeof(n_layer)); + io.read(&n_seq, sizeof(n_seq)); + + if (magic != DSV4_COMPRESSED_KV_STATE_MAGIC) { + throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: bad magic"); + } + if (version != DSV4_COMPRESSED_KV_STATE_VERSION) { + throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: bad version"); + } + if (n_layer != hparams.n_layer || n_layer != dsv4_cache_layers.size()) { + throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: mismatched layer count"); + } + + struct layer_meta { + uint32_t n_comp = 0; + bool has_attn = false; + int32_t attn_type = -1; + uint64_t attn_row_size = 0; + bool has_index = false; + int32_t index_type = -1; + uint64_t index_row_size = 0; + }; + + std::vector meta(n_layer); + for (uint32_t il = 0; il < n_layer; ++il) { + auto & m = meta[il]; + const auto & layer = dsv4_cache_layers[il]; + + io.read(&m.n_comp, sizeof(m.n_comp)); + + uint32_t has_attn; + io.read(&has_attn, sizeof(has_attn)); + m.has_attn = has_attn != 0; + if (m.has_attn) { + io.read(&m.attn_type, sizeof(m.attn_type)); + io.read(&m.attn_row_size, sizeof(m.attn_row_size)); + } + + uint32_t has_index; + io.read(&has_index, sizeof(has_index)); + m.has_index = has_index != 0; + if (m.has_index) { + io.read(&m.index_type, sizeof(m.index_type)); + io.read(&m.index_row_size, sizeof(m.index_row_size)); + } + + const bool local_has_attn = layer.attn_k != nullptr; + const bool local_has_index = layer.index_k != nullptr; + + if (m.n_comp != layer.n_comp || m.has_attn != local_has_attn || m.has_index != local_has_index) { + throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: mismatched layer layout"); + } + if (local_has_attn) { + const int32_t type_i = (int32_t) layer.attn_k->type; + const uint64_t row_size = dsv4_cache_row_size(layer.attn_k); + if (m.attn_type != type_i 
|| m.attn_row_size != row_size) { + throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: mismatched attention cache type"); + } + } + if (local_has_index) { + const int32_t type_i = (int32_t) layer.index_k->type; + const uint64_t row_size = dsv4_cache_row_size(layer.index_k); + if (m.index_type != type_i || m.index_row_size != row_size) { + throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: mismatched index cache type"); + } + } + } + + if (seq_id == -1) { + for (auto & [_, buf] : dsv4_ctxs_bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } + } else { + dsv4_clear_seq(seq_id); + } + + // scratch buffer for skipping additional blocks in targeted-restore mode + std::vector skip_buf; + + // For targeted restore (seq_id != -1) we follow the public API contract + // documented at llama.h:836 (llama_state_seq_set_data) and exercised by + // examples/save-load-state/save-load-state.cpp: the first encountered + // serialized block is REMAPPED into the requested destination seq_id. + // Subsequent blocks (atypical multi-seq payloads) are skipped to avoid + // silently merging multiple source sequences into one destination. + bool restored_one = false; + + for (uint32_t is = 0; is < n_seq; ++is) { + llama_seq_id src_seq_id; + io.read(&src_seq_id, sizeof(src_seq_id)); + + const bool skip_block = (seq_id != -1 && restored_one); + + const llama_seq_id dst_seq_id = (seq_id == -1) ? 
src_seq_id : seq_id; + if (!skip_block && (dst_seq_id < 0 || (uint32_t) dst_seq_id >= dsv4_n_seq_max)) { + throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: invalid sequence id"); + } + + for (uint32_t il = 0; il < n_layer; ++il) { + const auto & layer = dsv4_cache_layers[il]; + + if (layer.attn_k != nullptr) { + uint32_t n_rows; + io.read(&n_rows, sizeof(n_rows)); + if (n_rows > layer.n_comp) { + throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: too many attention rows"); + } + if (n_rows > 0) { + const size_t row_size = dsv4_cache_row_size(layer.attn_k); + const size_t nbytes = (size_t) n_rows * row_size; + if (skip_block) { + // advance io past this block's bytes without restoring + skip_buf.resize(nbytes); + io.read(skip_buf.data(), nbytes); + } else { + io.read_tensor(layer.attn_k, + dsv4_cache_offset(layer.attn_k, dst_seq_id, 0), nbytes); + } + } + } + + if (layer.index_k != nullptr) { + uint32_t n_rows; + io.read(&n_rows, sizeof(n_rows)); + if (n_rows > layer.n_comp) { + throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: too many index rows"); + } + if (n_rows > 0) { + const size_t row_size = dsv4_cache_row_size(layer.index_k); + const size_t nbytes = (size_t) n_rows * row_size; + if (skip_block) { + // advance io past this block's bytes without restoring + skip_buf.resize(nbytes); + io.read(skip_buf.data(), nbytes); + } else { + io.read_tensor(layer.index_k, + dsv4_cache_offset(layer.index_k, dst_seq_id, 0), nbytes); + } + } + } + } + + if (!skip_block && seq_id != -1) { + restored_one = true; + } + } } llama_kv_cache_iswa * llama_memory_hybrid_iswa::get_mem_attn() const { @@ -202,6 +806,41 @@ llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const { return mem_recr.get(); } +bool llama_memory_hybrid_iswa::has_dsv4_compressed_kv() const { + for (const auto & layer : dsv4_cache_layers) { + if (layer.n_comp != 0) { + return true; + } + } + + return false; +} + 
+uint32_t llama_memory_hybrid_iswa::get_dsv4_n_comp(int32_t il) const { + GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size()); + return dsv4_cache_layers[il].n_comp; +} + +ggml_tensor * llama_memory_hybrid_iswa::get_dsv4_attn_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const { + GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size()); + GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max); + + ggml_tensor * t = dsv4_cache_layers[il].attn_k; + GGML_ASSERT(t != nullptr); + + return ggml_view_2d(ctx, t, t->ne[0], t->ne[1], t->nb[1], seq_id*t->nb[2]); +} + +ggml_tensor * llama_memory_hybrid_iswa::get_dsv4_index_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const { + GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size()); + GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max); + + ggml_tensor * t = dsv4_cache_layers[il].index_k; + GGML_ASSERT(t != nullptr); + + return ggml_view_2d(ctx, t, t->ne[0], t->ne[1], t->nb[1], seq_id*t->nb[2]); +} + // // llama_memory_hybrid_iswa_context // @@ -209,6 +848,7 @@ llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const { llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_status status) : status(status) {} llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem) : + mem(mem), ctx_attn(mem->get_mem_attn()->init_full()), ctx_recr(mem->get_mem_recr()->init_full()), status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) { @@ -218,6 +858,7 @@ llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context( llama_memory_hybrid_iswa * mem, llama_context * lctx, bool optimize) : + mem(mem), ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)), ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)), status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) { @@ -228,6 +869,7 @@ 
llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context( slot_info_vec_t sinfos_base, slot_info_vec_t sinfos_swa, std::vector ubatches) : + mem(mem), ubatches(std::move(ubatches)), // note: here we copy the ubatches. not sure if this is ideal ctx_attn(new llama_kv_cache_iswa_context(mem->get_mem_attn(), std::move(sinfos_base), std::move(sinfos_swa), this->ubatches)), @@ -275,3 +917,22 @@ const llama_kv_cache_iswa_context * llama_memory_hybrid_iswa_context::get_attn() const llama_memory_recurrent_context * llama_memory_hybrid_iswa_context::get_recr() const { return static_cast(ctx_recr.get()); } + +bool llama_memory_hybrid_iswa_context::has_dsv4_compressed_kv() const { + return mem != nullptr && mem->has_dsv4_compressed_kv(); +} + +uint32_t llama_memory_hybrid_iswa_context::get_dsv4_n_comp(int32_t il) const { + GGML_ASSERT(mem != nullptr); + return mem->get_dsv4_n_comp(il); +} + +ggml_tensor * llama_memory_hybrid_iswa_context::get_dsv4_attn_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const { + GGML_ASSERT(mem != nullptr); + return mem->get_dsv4_attn_k(ctx, il, seq_id); +} + +ggml_tensor * llama_memory_hybrid_iswa_context::get_dsv4_index_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const { + GGML_ASSERT(mem != nullptr); + return mem->get_dsv4_index_k(ctx, il, seq_id); +} diff --git a/src/llama-memory-hybrid-iswa.h b/src/llama-memory-hybrid-iswa.h index c9d3f9f57c50..8d11f081615c 100644 --- a/src/llama-memory-hybrid-iswa.h +++ b/src/llama-memory-hybrid-iswa.h @@ -83,11 +83,39 @@ class llama_memory_hybrid_iswa : public llama_memory_i { llama_kv_cache_iswa * get_mem_attn() const; llama_memory_recurrent * get_mem_recr() const; + bool has_dsv4_compressed_kv() const; + uint32_t get_dsv4_n_comp(int32_t il) const; + ggml_tensor * get_dsv4_attn_k (ggml_context * ctx, int32_t il, llama_seq_id seq_id) const; + ggml_tensor * get_dsv4_index_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const; + private: const llama_hparams & hparams; const 
std::unique_ptr mem_attn; const std::unique_ptr mem_recr; + + struct dsv4_cache_layer { + uint32_t n_comp = 0; + ggml_tensor * attn_k = nullptr; + ggml_tensor * index_k = nullptr; + }; + + uint32_t dsv4_n_seq_max = 0; + std::vector dsv4_cache_layers; + std::vector> dsv4_ctxs_bufs; + + void dsv4_seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); + void dsv4_seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); + void dsv4_seq_keep(llama_seq_id seq_id); + + void dsv4_clear_seq(llama_seq_id seq_id); + void dsv4_clear_rows(llama_seq_id seq_id, int32_t il, llama_pos p0, llama_pos p1); + void dsv4_copy_rows (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, int32_t il, llama_pos p0, llama_pos p1); + + uint32_t dsv4_n_state_rows(int32_t il, llama_seq_id seq_id) const; + + void dsv4_state_write(llama_io_write_i & io, llama_seq_id seq_id) const; + void dsv4_state_read (llama_io_read_i & io, llama_seq_id seq_id); }; class llama_memory_hybrid_iswa_context : public llama_memory_context_i { @@ -128,7 +156,14 @@ class llama_memory_hybrid_iswa_context : public llama_memory_context_i { const llama_kv_cache_iswa_context * get_attn() const; const llama_memory_recurrent_context * get_recr() const; + bool has_dsv4_compressed_kv() const; + uint32_t get_dsv4_n_comp(int32_t il) const; + ggml_tensor * get_dsv4_attn_k (ggml_context * ctx, int32_t il, llama_seq_id seq_id) const; + ggml_tensor * get_dsv4_index_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const; + private: + llama_memory_hybrid_iswa * mem = nullptr; + // the index of the next ubatch to process size_t i_next = 0; diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index fd305cab79c2..529022ded18d 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -33,6 +33,7 @@ llama_memory_hybrid::llama_memory_hybrid( hparams(model.hparams), mem_attn(new llama_kv_cache( model, + model.hparams, type_k, type_v, v_trans, diff --git 
a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index c645d0785ab7..3343790070c4 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -392,6 +392,7 @@ namespace GGUFMeta { return get_arr(llm_kv(kid), result, required); } + template bool llama_model_loader::get_arr>(enum llm_kv kid, std::vector & result, bool required); template bool llama_model_loader::get_arr>(enum llm_kv kid, std::vector & result, bool required); template diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8bf20a716eba..d35fb4db23a2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -170,6 +170,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_deepseek2(params); case LLM_ARCH_DEEPSEEK2OCR: return new llama_model_deepseek2ocr(params); + case LLM_ARCH_DEEPSEEK4: + return new llama_model_deepseek4(params); case LLM_ARCH_GLM_DSA: return new llama_model_glm_dsa(params); case LLM_ARCH_MISTRAL4: @@ -777,6 +779,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_310B_A15B: return "310B.A15B"; case LLM_TYPE_355B_A32B: return "355B.A32B"; case LLM_TYPE_397B_A17B: return "397B.A17B"; + case LLM_TYPE_685B_A37B: return "685B.A37B"; case LLM_TYPE_744B_A40B: return "744B.A40B"; case LLM_TYPE_E2B: return "E2B"; case LLM_TYPE_E4B: return "E4B"; @@ -1768,6 +1771,27 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); } + if (arch == LLM_ARCH_DEEPSEEK4) { + LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); + LLAMA_LOG_INFO("%s: n_lora_o = %d\n", __func__, hparams.n_lora_o); + LLAMA_LOG_INFO("%s: n_attn_out_groups = %d\n", __func__, hparams.n_attn_out_groups); + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); + LLAMA_LOG_INFO("%s: n_swa = 
%d\n", __func__, hparams.n_swa); + LLAMA_LOG_INFO("%s: compress_rope_freq_base = %.1f\n", __func__, hparams.compress_rope_freq_base); + LLAMA_LOG_INFO("%s: indexer_n_head = %d\n", __func__, hparams.indexer_n_head); + LLAMA_LOG_INFO("%s: indexer_head_size = %d\n", __func__, hparams.indexer_head_size); + LLAMA_LOG_INFO("%s: indexer_top_k = %d\n", __func__, hparams.indexer_top_k); + LLAMA_LOG_INFO("%s: n_hash_layers = %d\n", __func__, hparams.n_hash_layers); + LLAMA_LOG_INFO("%s: n_hc = %d\n", __func__, hparams.n_hc); + LLAMA_LOG_INFO("%s: hc_sinkhorn_iters = %d\n", __func__, hparams.hc_sinkhorn_iters); + LLAMA_LOG_INFO("%s: hc_eps = %.1e\n", __func__, hparams.hc_eps); + LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers); + LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); + LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); + LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); + } + if (arch == LLM_ARCH_QWEN2MOE) { LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); @@ -1943,6 +1967,58 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, { res = nullptr; } break; + case LLM_ARCH_DEEPSEEK4: + { + llama_memory_i::layer_filter_cb filter_attn = [&](int32_t) { + return true; + }; + llama_memory_i::layer_filter_cb filter_recr = [&](int32_t il) { + return hparams.attn_compress_ratio[il] != 0; + }; + + // V4's standard SWA K cache, compressed-attention K cache + // (cache.attn_k), and indexer K cache (cache.index_k) all + // share the same `type_k` and must agree in dtype because + // src/models/deepseek4.cpp concatenates the SWA K view with + // the compressed K view via ggml_concat (which asserts + // a->type == b->type). 
Furthermore, V4's K activations are + // post-fp8-quantized (ggml_dsv4_fp8_kv_quantize), and q8_0's + // single fp16 scale per 32-element block cannot faithfully + // reproduce fp8-quantized value distributions -- pinning to + // q8_0 corrupts decode silently ("=" loops, "Mirror ..." + // garbage). Force fp16 unconditionally for V4 KV caches. + // + // NOTE: the user-facing WARN and the params.type_k/type_v + // coercion already happen earlier in llama_init_from_model + // (src/llama-context.cpp), BEFORE the shared + // SPLIT_MODE_TENSOR / V-quant-requires-FA validations run, + // so users requesting q8_0 KV with V4 don't trip those + // checks. The fp16 pin here is a defense-in-depth safety + // net for any direct callers of create_memory() that + // bypass llama_init_from_model. See + // docs/plans/v4-port-kv-q8-completion.md. + ggml_type v4_type_k = GGML_TYPE_F16; + ggml_type v4_type_v = GGML_TYPE_F16; + + res = new llama_memory_hybrid_iswa( + /* model */ *this, + /* attn_type_k */ v4_type_k, + /* attn_type_v */ v4_type_v, + /* attn_v_trans */ !cparams.flash_attn, + /* attn_swa_full */ params.swa_full, + /* attn_kv_size */ cparams.n_ctx_seq, + /* attn_n_ubatch */ cparams.n_ubatch, + /* attn_n_pad */ 1, + /* recurrent_type_r */ GGML_TYPE_F32, + /* recurrent_type_s */ GGML_TYPE_F32, + /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max), + /* n_seq_max */ cparams.n_seq_max, + /* n_rs_seq */ cparams.n_rs_seq, + /* offload */ cparams.offload_kqv, + /* unified */ cparams.kv_unified, + /* filter_attn */ std::move(filter_attn), + /* filter_recr */ std::move(filter_recr)); + } break; // Models that need standard caching should rely on recurrent/hybrid // checks default: @@ -2069,6 +2145,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, res = new llama_kv_cache( *this, + hparams, params.type_k, params.type_v, !cparams.flash_attn, @@ -2258,6 +2335,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case 
LLM_ARCH_DEEPSEEK: case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_DEEPSEEK2OCR: + case LLM_ARCH_DEEPSEEK4: case LLM_ARCH_PLM: case LLM_ARCH_CHATGLM: case LLM_ARCH_GRANITE: diff --git a/src/llama-model.h b/src/llama-model.h index 01c87a75271f..fe1ea971a16c 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -137,6 +137,7 @@ enum llm_type { LLM_TYPE_310B_A15B, // /MiMo-V2-Flash LLM_TYPE_355B_A32B, // GLM-4.5 LLM_TYPE_397B_A17B, // Qwen3.5 + LLM_TYPE_685B_A37B, // DeepSeek V4-Flash LLM_TYPE_744B_A40B, // GLM-5 LLM_TYPE_E2B, LLM_TYPE_E4B, @@ -257,6 +258,15 @@ struct llama_layer { struct ggml_tensor * wv_enc = nullptr; struct ggml_tensor * wo_enc = nullptr; struct ggml_tensor * wqkv_gate = nullptr; + struct ggml_tensor * attn_kv = nullptr; + struct ggml_tensor * attn_wo_a = nullptr; + struct ggml_tensor * attn_wo_b = nullptr; + + // DeepSeek V4 KV compressors + struct ggml_tensor * attn_compressor_ape = nullptr; + struct ggml_tensor * attn_compressor_kv = nullptr; + struct ggml_tensor * attn_compressor_gate = nullptr; + struct ggml_tensor * attn_compressor_norm = nullptr; // relative position bias struct ggml_tensor * attn_rel_b = nullptr; @@ -322,6 +332,7 @@ struct llama_layer { struct ggml_tensor * ffn_up_b = nullptr; // b3 struct ggml_tensor * ffn_act = nullptr; struct ggml_tensor * ffn_exp_probs_b = nullptr; + struct ggml_tensor * ffn_gate_tid2eid = nullptr; // mamba proj struct ggml_tensor * ssm_in = nullptr; @@ -483,6 +494,18 @@ struct llama_layer { struct ggml_tensor * indexer_proj = nullptr; struct ggml_tensor * indexer_attn_k = nullptr; struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias + struct ggml_tensor * indexer_compressor_ape = nullptr; + struct ggml_tensor * indexer_compressor_kv = nullptr; + struct ggml_tensor * indexer_compressor_gate = nullptr; + struct ggml_tensor * indexer_compressor_norm = nullptr; + + // DeepSeek V4 hyper-connection weights + struct ggml_tensor * hc_attn_base = nullptr; + struct ggml_tensor * hc_attn_fn 
= nullptr; + struct ggml_tensor * hc_attn_scale = nullptr; + struct ggml_tensor * hc_ffn_base = nullptr; + struct ggml_tensor * hc_ffn_fn = nullptr; + struct ggml_tensor * hc_ffn_scale = nullptr; // gemma4 layer output scale struct ggml_tensor * out_scale = nullptr; @@ -531,6 +554,9 @@ struct llama_model { struct ggml_tensor * output_norm_b = nullptr; struct ggml_tensor * output = nullptr; struct ggml_tensor * output_b = nullptr; + struct ggml_tensor * output_hc_base = nullptr; + struct ggml_tensor * output_hc_fn = nullptr; + struct ggml_tensor * output_hc_scale = nullptr; struct ggml_tensor * output_norm_enc = nullptr; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 43e05c3d56fe..6914df768c1e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -292,6 +292,14 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param // quantize only 2D and 3D tensors (experts) if (ggml_n_dims(tensor) < 2) return false; + // do not quantize integer tensors (e.g. DeepSeek V4 ffn_gate_tid2eid which + // stores expert-id indices as I32). Quantization makes no sense for non + // floating-point data; the dequantize path also explicitly rejects them. + if (tensor->type == GGML_TYPE_I8 || + tensor->type == GGML_TYPE_I16 || + tensor->type == GGML_TYPE_I32 || + tensor->type == GGML_TYPE_I64) return false; + const std::string name = ggml_get_name(tensor); // This used to be a regex, but has an extreme cost to compile times. 
diff --git a/src/models/deepseek4.cpp b/src/models/deepseek4.cpp
new file mode 100644
index 000000000000..71d803014fd6
--- /dev/null
+++ b/src/models/deepseek4.cpp
@@ -0,0 +1,1583 @@
+#include "models.h"
+
+#include "ggml-backend.h"
+#include "llama-kv-cache-iswa.h"
+#include "llama-memory-hybrid-iswa.h"
+#include "llama-memory-recurrent.h"
+
+// NOTE(review): the six #include lines below have no header name in this copy
+// of the patch; angle-bracket spans appear to have been stripped. Restore them
+// from the upstream file before applying.
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Reads the DeepSeek V4 hyper-parameters from the model loader into `hparams`.
+//
+// Keys read with a trailing `false` argument are presumably optional
+// (TODO confirm against llama_model_loader::get_key); the others are read
+// without that flag.
+void llama_model_deepseek4::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+    ml.get_key(LLM_KV_ATTENTION_OUTPUT_LORA_RANK, hparams.n_lora_o);
+    ml.get_key(LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT,hparams.n_attn_out_groups);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+    // Fall back to sqrt-softplus gating when the gating function is still NONE
+    // after the lookup above.
+    if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS;
+    }
+
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+    if (hparams.n_swa > 0) {
+        // With a sliding window configured, the SWA rope parameters are copied
+        // from the regular training rope parameters.
+        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+        hparams.set_swa_pattern(0, false);
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+    }
+    ml.get_key(LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE, hparams.compress_rope_freq_base, false);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head, false);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size, false);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k, false);
+    ml.get_key(LLM_KV_HASH_LAYER_COUNT, hparams.n_hash_layers);
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+    ml.get_key(LLM_KV_HYPER_CONNECTION_COUNT, hparams.n_hc);
+    ml.get_key(LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS, hparams.hc_sinkhorn_iters);
+    ml.get_key(LLM_KV_HYPER_CONNECTION_EPS, hparams.hc_eps);
+    ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false);
+
+    // NOTE(review): the element type of this vector is missing in this copy of
+    // the patch (looks stripped); restore it from upstream.
+    std::vector compress_ratios;
+    ml.get_arr(LLM_KV_ATTENTION_COMPRESS_RATIOS, compress_ratios);
+    // Only "too few" entries is an error; extra trailing entries are ignored by
+    // the copy_n below, which takes exactly n_layer values.
+    if (compress_ratios.size() < hparams.n_layer) {
+        throw std::runtime_error(format("DeepSeek V4 compress ratio count mismatch: got %zu, expected %u",
+            compress_ratios.size(), hparams.n_layer));
+    }
+    std::copy_n(compress_ratios.begin(), hparams.n_layer, hparams.attn_compress_ratio.begin());
+
+    // dsv4_state_size becomes the largest per-layer compressor state size.
+    // Layers with ratio 0 contribute nothing.
+    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+        const uint32_t ratio = hparams.attn_compress_ratio[il];
+        if (ratio == 0) {
+            continue;
+        }
+
+        // (coff * ratio) rows times (coff * head size) columns; coff is 2 only
+        // for ratio 4. Ratio-4 layers additionally account for an indexer state
+        // of the same row count.
+        const uint32_t coff = ratio == 4 ? 2 : 1;
+        uint32_t state_size = coff * ratio * coff * hparams.n_embd_head_k(il);
+        if (ratio == 4) {
+            state_size += coff * ratio * coff * hparams.indexer_head_size;
+        }
+        hparams.dsv4_state_size = std::max(hparams.dsv4_state_size, state_size);
+    }
+
+    type = LLM_TYPE_UNKNOWN;
+}
+
+// Creates every weight tensor of the DeepSeek V4 architecture: global
+// embeddings/output tensors, then per-layer hyper-connection, attention,
+// compressor/indexer and MoE tensors.
+//
+// Throws std::runtime_error when the attention output group count is zero
+// (it is used as a divisor in the attn_wo_a shape below).
+void llama_model_deepseek4::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t q_lora_rank = hparams.n_lora_q;
+    const int64_t o_lora_rank = hparams.n_lora_o;
+    const int64_t n_out_groups = hparams.n_attn_out_groups;
+    const int64_t n_ff_exp = hparams.n_ff_exp;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+    const int64_t n_hc = hparams.n_hc;
+    const int64_t hc_dim = n_hc * n_embd;
+    const int64_t hc_mix = (2 + n_hc) * n_hc;
+
+    if (n_out_groups == 0) {
+        throw std::runtime_error("DeepSeek V4 requires attention output groups");
+    }
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+    output_hc_base = create_tensor(tn(LLM_TENSOR_OUTPUT_HC_BASE, "weight"), {n_hc}, 0);
+    output_hc_fn = create_tensor(tn(LLM_TENSOR_OUTPUT_HC_FN, "weight"), {hc_dim, n_hc}, 0);
+    output_hc_scale = create_tensor(tn(LLM_TENSOR_OUTPUT_HC_SCALE, "weight"), {1}, 0);
+
+    // Creates the four compressor tensors (ape, kv, gate, norm) of one layer.
+    // `indexer` selects the indexer_compressor_* slots and tensor names instead
+    // of the attn_compressor_* ones. Widths are doubled (coff == 2) for ratio 4.
+    auto create_deepseek4_compressor = [&](llama_layer & layer, int bid, int64_t compress_ratio, int64_t head_size, bool indexer) {
+        const int64_t coff = compress_ratio == 4 ? 2 : 1;
+        ggml_tensor *& ape = indexer ? layer.indexer_compressor_ape : layer.attn_compressor_ape;
+        ggml_tensor *& kv = indexer ? layer.indexer_compressor_kv : layer.attn_compressor_kv;
+        ggml_tensor *& gate = indexer ? layer.indexer_compressor_gate : layer.attn_compressor_gate;
+        ggml_tensor *& norm = indexer ? layer.indexer_compressor_norm : layer.attn_compressor_norm;
+
+        ape = create_tensor(tn(indexer ? LLM_TENSOR_INDEXER_COMPRESSOR_APE : LLM_TENSOR_ATTN_COMPRESSOR_APE, "weight", bid), {coff * head_size, compress_ratio}, 0);
+        kv = create_tensor(tn(indexer ? LLM_TENSOR_INDEXER_COMPRESSOR_KV : LLM_TENSOR_ATTN_COMPRESSOR_KV, "weight", bid), {n_embd, coff * head_size}, 0);
+        gate = create_tensor(tn(indexer ? LLM_TENSOR_INDEXER_COMPRESSOR_GATE : LLM_TENSOR_ATTN_COMPRESSOR_GATE, "weight", bid), {n_embd, coff * head_size}, 0);
+        norm = create_tensor(tn(indexer ? LLM_TENSOR_INDEXER_COMPRESSOR_NORM : LLM_TENSOR_ATTN_COMPRESSOR_NORM, "weight", bid), {head_size}, 0);
+    };
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        const int64_t compress_ratio = hparams.attn_compress_ratio[i];
+
+        // hyper-connection weights for the attention and FFN sub-blocks
+        layer.hc_attn_base = create_tensor(tn(LLM_TENSOR_HC_ATTN_BASE, "weight", i), {hc_mix}, 0);
+        layer.hc_attn_fn = create_tensor(tn(LLM_TENSOR_HC_ATTN_FN, "weight", i), {hc_dim, hc_mix}, 0);
+        layer.hc_attn_scale = create_tensor(tn(LLM_TENSOR_HC_ATTN_SCALE, "weight", i), {3}, 0);
+        layer.hc_ffn_base = create_tensor(tn(LLM_TENSOR_HC_FFN_BASE, "weight", i), {hc_mix}, 0);
+        layer.hc_ffn_fn = create_tensor(tn(LLM_TENSOR_HC_FFN_FN, "weight", i), {hc_dim, hc_mix}, 0);
+        layer.hc_ffn_scale = create_tensor(tn(LLM_TENSOR_HC_FFN_SCALE, "weight", i), {3}, 0);
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
+        layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
+        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+        layer.attn_kv = create_tensor(tn(LLM_TENSOR_ATTN_KV, "weight", i), {n_embd, n_embd_head_k}, 0);
+        layer.attn_wo_a = create_tensor(tn(LLM_TENSOR_ATTN_OUT_A, "weight", i), {n_head * n_embd_head_v / n_out_groups, n_out_groups * o_lora_rank}, 0);
+        layer.attn_wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_B, "weight", i), {n_out_groups * o_lora_rank, n_embd}, 0);
+
+        // Any non-zero ratio gets an attention compressor; ratio 4 additionally
+        // gets the indexer projections and an indexer compressor.
+        if (compress_ratio > 0) {
+            create_deepseek4_compressor(layer, i, compress_ratio, n_embd_head_k, false);
+        }
+        if (compress_ratio == 4) {
+            layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, 0);
+            layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, 0);
+            create_deepseek4_compressor(layer, i, compress_ratio, hparams.indexer_head_size, true);
+        }
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+        // The first n_hash_layers layers require the tid2eid table and treat the
+        // expert-probability bias as optional; later layers are the reverse.
+        // NOTE(review): the target type of the static_cast below is missing in
+        // this copy of the patch (looks stripped); restore it from upstream.
+        if (static_cast(i) < hparams.n_hash_layers) {
+            layer.ffn_gate_tid2eid = create_tensor(tn(LLM_TENSOR_FFN_GATE_TID2EID, "weight", i), {n_expert_used, n_vocab}, 0);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+        } else {
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+            layer.ffn_gate_tid2eid = create_tensor(tn(LLM_TENSOR_FFN_GATE_TID2EID, "weight", i), {n_expert_used, n_vocab}, TENSOR_NOT_REQUIRED);
+        }
+
+        // routed experts
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+        // shared experts
+        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_exp * n_expert_shared, n_embd}, 0);
+        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+    }
+}
+
+// Builds the compute-graph object for this model from `params`.
+// NOTE(review): the template arguments of std::unique_ptr / std::make_unique
+// are missing in this copy of the patch (look stripped); restore from upstream.
+std::unique_ptr llama_model_deepseek4::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique(*this, params);
+}
+
+namespace {
+
+// Result bundle of dsv4_hc_pre: the mixed input `x`, the raw `mixes`
+// projection and the pre/post/comb weights split out of it.
+struct dsv4_hc_mix {
+    ggml_tensor * x;
+    ggml_tensor * mixes;
+    ggml_tensor * pre;
+    ggml_tensor * post;
+    ggml_tensor * comb;
+};
+
+// kv/score tensors of one compressor state.
+struct dsv4_state_pair {
+    ggml_tensor * kv;
+    ggml_tensor * score;
+};
+
+// Result of one decode-time compressor step: the updated state tensors and,
+// when a block was completed on this step, the pooled entry (else nullptr).
+struct dsv4_decode_compressor {
+    ggml_tensor * kv_state;
+    ggml_tensor * score_state;
+    ggml_tensor * kv_comp;
+};
+
+// Dimensions of a compressor state buffer; elems == width * rows.
+struct dsv4_state_layout {
+    int64_t width;
+    int64_t rows;
+    int64_t elems;
+};
+
+// Which fill routine(s) dsv4_graph_inputs::set_input applies to a mask.
+enum class dsv4_mask_kind {
+    RAW_WINDOW,       // causal (optionally windowed) mask over the ubatch tokens
+    COMPRESS_CAUSAL,  // causal mask over compressed entries
+    ATTN_STATIC,      // raw-window columns followed by compressed columns
+};
+
+// One registered mask tensor plus the parameters needed to fill it.
+struct dsv4_mask_entry {
+    ggml_tensor * tensor = nullptr;
+    // Default-initialised like every other member so a value-constructed entry
+    // never carries an indeterminate kind.
+    dsv4_mask_kind kind = dsv4_mask_kind::RAW_WINDOW;
+    int64_t n_raw = 0;
+    int64_t n_comp = 0;
+    int64_t window = 0;
+    int64_t ratio = 0;
+};
+
+// Graph input that owns the DeepSeek V4 attention masks and fills them from the
+// positions of the current ubatch.
+class dsv4_graph_inputs : public llm_graph_input_i {
+public:
+    // Creates an F32 [n0, n1, 1, 1] input tensor named `name`, registers it for
+    // filling in set_input() and returns it.
+    ggml_tensor * add_mask(
+            ggml_context * ctx,
+            dsv4_mask_kind kind,
+            int64_t n0,
+            int64_t n1,
+            int64_t n_raw,
+            int64_t n_comp,
+            int64_t window,
+            int64_t ratio,
+            const char * name) {
+        ggml_tensor * t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n0, n1, 1, 1);
+        ggml_set_input(t);
+        ggml_set_name(t, name);
+        masks.push_back({ t, kind, n_raw, n_comp, window, ratio });
+        return t;
+    }
+
+    // Fills every registered mask that has a backend buffer. Each mask starts as
+    // all -INFINITY and the visible cells are then set to 0.
+    void set_input(const llama_ubatch * ubatch) override {
+        for (const auto & mask : masks) {
+            GGML_ASSERT(mask.tensor != nullptr);
+            if (mask.tensor->buffer == nullptr) {
+                // no buffer assigned to this tensor; nothing to upload
+                continue;
+            }
+
+            const int64_t n0 = mask.tensor->ne[0];
+            const int64_t n1 = mask.tensor->ne[1];
+
+            std::vector<float> data(n0*n1, -INFINITY);
+
+            switch (mask.kind) {
+                case dsv4_mask_kind::RAW_WINDOW:
+                    fill_raw_window(data, n0, n1, mask.window, ubatch);
+                    break;
+                case dsv4_mask_kind::COMPRESS_CAUSAL:
+                    fill_compress_causal(data, n0, n1, mask.ratio, 0, ubatch);
+                    break;
+                case dsv4_mask_kind::ATTN_STATIC:
+                    // raw columns first, compressed columns start at n_raw
+                    fill_raw_window(data, n0, n1, mask.window, ubatch);
+                    fill_compress_causal(data, n0, n1, mask.ratio, mask.n_raw, ubatch);
+                    break;
+            }
+
+            ggml_backend_tensor_set(mask.tensor, data.data(), 0, data.size()*sizeof(float));
+        }
+    }
+
+private:
+    // Opens cell (iq, ik) when key token ik is not after query token iq and,
+    // if window > 0, lies fewer than `window` positions before it. Positions
+    // fall back to the token index when the ubatch carries no pos array.
+    static void fill_raw_window(
+            std::vector<float> & data,
+            int64_t n0,
+            int64_t n1,
+            int64_t window,
+            const llama_ubatch * ubatch) {
+        GGML_ASSERT((int64_t) ubatch->n_tokens == n1);
+
+        // Loop-invariant key bound; the explicit type reconciles the int64_t
+        // row width with the ubatch token count.
+        const int64_t n_keys = std::min<int64_t>(n0, ubatch->n_tokens);
+
+        for (int64_t iq = 0; iq < n1; ++iq) {
+            const llama_pos p1 = ubatch->pos ? ubatch->pos[iq] : (llama_pos) iq;
+
+            for (int64_t ik = 0; ik < n_keys; ++ik) {
+                const llama_pos p0 = ubatch->pos ? ubatch->pos[ik] : (llama_pos) ik;
+
+                if (p0 > p1) {
+                    continue;
+                }
+
+                if (window > 0 && p1 - p0 >= window) {
+                    continue;
+                }
+
+                data[iq*n0 + ik] = 0.0f;
+            }
+        }
+    }
+
+    // For each query row, opens the first (pos + 1) / ratio compressed columns
+    // (capped at the n0 - offset columns available), starting at column `offset`.
+    static void fill_compress_causal(
+            std::vector<float> & data,
+            int64_t n0,
+            int64_t n1,
+            int64_t ratio,
+            int64_t offset,
+            const llama_ubatch * ubatch) {
+        GGML_ASSERT(ratio > 0);
+
+        const int64_t n_comp = n0 - offset;
+        for (int64_t iq = 0; iq < n1; ++iq) {
+            const llama_pos p1 = ubatch->pos ? ubatch->pos[iq] : (llama_pos) iq;
+            const int64_t n_visible = (p1 + 1) / ratio;
+
+            for (int64_t ic = 0; ic < std::min(n_comp, n_visible); ++ic) {
+                data[iq*n0 + offset + ic] = 0.0f;
+            }
+        }
+    }
+
+    std::vector<dsv4_mask_entry> masks;
+};
+
+// RoPE parameters handed to dsv4_apply_rope_tail.
+struct dsv4_rope_cfg {
+    int32_t n_ctx_orig;
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+};
+
+// Returns a 1x1 view of element `idx` of `scale`.
+static ggml_tensor * dsv4_view_scale(ggml_context * ctx, ggml_tensor * scale, int64_t idx) {
+    return ggml_view_2d(ctx, scale, 1, 1, scale->nb[0], idx * scale->nb[0]);
+}
+
+// Adds `value` to every element of `x`; the result has the shape of `x`.
+// The tensor is made contiguous and flattened before the op.
+static ggml_tensor * dsv4_add_scalar(ggml_context * ctx, ggml_tensor * x, float value) {
+    ggml_tensor * shape = x;
+    x = ggml_cont(ctx, x);
+    x = ggml_reshape_1d(ctx, x, ggml_nelements(x));
+    x = ggml_scale_bias(ctx, x, 1.0f, value);
+    return ggml_reshape(ctx, x, shape);
+}
+
+// Multiplies every element of `x` by `value`; the result has the shape of `x`.
+static ggml_tensor * dsv4_mul_scalar(ggml_context * ctx, ggml_tensor * x, float value) {
+    ggml_tensor * shape = x;
+    x = ggml_cont(ctx, x);
+    x = ggml_reshape_1d(ctx, x, ggml_nelements(x));
+    x = ggml_scale(ctx, x, value);
+    return ggml_reshape(ctx, x, shape);
+}
+
+// Builds an I32 tensor from a float arange over [begin, end) with step 1.
+static ggml_tensor * dsv4_arange_i32(ggml_context * ctx, int64_t begin, int64_t end) {
+    ggml_tensor * t = ggml_arange(ctx, (float) begin, (float) end, 1.0f);
+    return ggml_cast(ctx, t,
GGML_TYPE_I32);
+}
+
+// Allocates an F32 [n0, n1] tensor and returns a fill op setting it to `value`.
+static ggml_tensor * dsv4_new_filled_2d(ggml_context * ctx, int64_t n0, int64_t n1, float value) {
+    ggml_tensor * blank = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n0, n1);
+    return ggml_fill(ctx, blank, value);
+}
+
+// Allocates an F32 [n0, n1, n2] tensor and returns a fill op setting it to `value`.
+static ggml_tensor * dsv4_new_filled_3d(ggml_context * ctx, int64_t n0, int64_t n1, int64_t n2, float value) {
+    ggml_tensor * blank = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n0, n1, n2);
+    return ggml_fill(ctx, blank, value);
+}
+
+// Computes the compressor state dimensions: width and row count are both
+// doubled for compress_ratio == 4, otherwise they are head_dim x compress_ratio.
+static dsv4_state_layout dsv4_make_state_layout(int64_t compress_ratio, int64_t head_dim) {
+    int64_t factor = 1;
+    if (compress_ratio == 4) {
+        factor = 2;
+    }
+
+    dsv4_state_layout layout;
+    layout.width = factor * head_dim;
+    layout.rows  = factor * compress_ratio;
+    layout.elems = layout.width * layout.rows;
+    return layout;
+}
+
+// Returns an [n0, n1] view of `x` starting at element off0 of row off1,
+// keeping the row stride of `x`.
+static ggml_tensor * dsv4_view_cols(
+        ggml_context * ctx,
+        ggml_tensor * x,
+        int64_t n0,
+        int64_t n1,
+        int64_t off0,
+        int64_t off1) {
+    const size_t row_stride = x->nb[1];
+    const size_t byte_start = off1*x->nb[1] + off0*x->nb[0];
+    return ggml_view_2d(ctx, x, n0, n1, row_stride, byte_start);
+}
+
+// Returns a densely packed [width, rows] view of `state` that begins `offset`
+// elements into it.
+static ggml_tensor * dsv4_view_state_segment(
+        ggml_context * ctx,
+        ggml_tensor * state,
+        int64_t offset,
+        int64_t width,
+        int64_t rows) {
+    const size_t row_stride = width*state->nb[0];
+    const size_t byte_start = offset*state->nb[0];
+    return ggml_view_2d(ctx, state, width, rows, row_stride, byte_start);
+}
+
+// Flattens `src` and schedules a copy of it into `dst`, starting at element
+// head*state_size + offset.
+static void dsv4_store_state_segment(
+        ggml_context * ctx,
+        ggml_cgraph * gf,
+        ggml_tensor * src,
+        ggml_tensor * dst,
+        int64_t state_size,
+        int64_t head,
+        int64_t offset) {
+    const int64_t n_elems = ggml_nelements(src);
+    ggml_tensor * flat = ggml_reshape_1d(ctx, ggml_cont(ctx, src), n_elems);
+
+    const size_t byte_start = (head*state_size + offset)*ggml_element_size(dst);
+    ggml_tensor * target = ggml_view_1d(ctx, dst, n_elems, byte_start);
+
+    ggml_build_forward_expand(gf, ggml_cpy(ctx, flat, target));
+}
+
+// Schedules a copy of `src` (reshaped to n_rows rows of the cache row width)
+// into rows [row_start, row_start + n_rows) of `cache`. Does nothing when
+// n_rows is not positive.
+static void dsv4_store_cache_rows(
+        ggml_context * ctx,
+        ggml_cgraph * gf,
+        ggml_tensor * cache,
+        ggml_tensor * src,
+        int64_t row_start,
+        int64_t n_rows) {
+    if (n_rows <= 0) {
+        return;
+    }
+
+    const int64_t row_width = cache->ne[0];
+    ggml_tensor * rows = ggml_reshape_2d(ctx, ggml_cont(ctx, src), row_width, n_rows);
+
+    // ggml_set_rows is deliberately not used. Per the original author's note,
+    // on multi-GPU the scheduler routes set_rows by the SOURCE device while the
+    // cache destination has its own device affinity, which leads to an illegal
+    // memory access when the two differ. A ggml_cpy into a contiguous view of
+    // the cache is routed by the destination instead (the same approach as
+    // dsv4_store_state_segment).
+    const size_t row_stride = cache->nb[1];
+    ggml_tensor * target = ggml_view_2d(ctx, cache,
+        row_width, n_rows,
+        row_stride,
+        row_start * cache->nb[1]);
+    ggml_build_forward_expand(gf, ggml_cpy(ctx, rows, target));
+}
+
+// Selects the RoPE configuration for a layer. compress_ratio == 0 uses the
+// training frequency base with scale 1 and no extension factor; any other
+// ratio uses the context's YaRN settings and the compress-specific frequency
+// base when one is set.
+static dsv4_rope_cfg dsv4_make_rope_cfg(
+        const llama_hparams & hparams,
+        const llama_cparams & cparams,
+        uint32_t compress_ratio) {
+    if (compress_ratio == 0) {
+        dsv4_rope_cfg plain;
+        plain.n_ctx_orig  = 0;
+        plain.freq_base   = hparams.rope_freq_base_train;
+        plain.freq_scale  = 1.0f;
+        plain.ext_factor  = 0.0f;
+        plain.attn_factor = 1.0f;
+        plain.beta_fast   = cparams.yarn_beta_fast;
+        plain.beta_slow   = cparams.yarn_beta_slow;
+        return plain;
+    }
+
+    float attn_factor = 1.0f;
+    if (cparams.yarn_ext_factor != 0.0f && cparams.rope_freq_scale > 0.0f) {
+        // Compressed RoPE uses YaRN-style frequency interpolation, but per the
+        // original note the reference implementation does not apply YaRN's
+        // magnitude scale, so the factor is divided out here.
+        attn_factor /= 1.0f + 0.1f * std::log(1.0f / cparams.rope_freq_scale);
+    }
+
+    return {
+        (int32_t) cparams.n_ctx_orig_yarn,
+        hparams.compress_rope_freq_base > 0.0f ?
hparams.compress_rope_freq_base : cparams.rope_freq_base,
+        cparams.rope_freq_scale,
+        cparams.yarn_ext_factor,
+        attn_factor,
+        cparams.yarn_beta_fast,
+        cparams.yarn_beta_slow,
+    };
+}
+
+// Returns an [n, 1] view of `base` starting at element `off`.
+static ggml_tensor * dsv4_view_base(ggml_context * ctx, ggml_tensor * base, int64_t n, int64_t off) {
+    const size_t elem_size = base->nb[0];
+    return ggml_view_2d(ctx, base, n, 1, elem_size, off * elem_size);
+}
+
+// Applies (or, with `inverse`, undoes) RoPE on `x`, which must be
+// [n_embd_head, n_head, n_tokens]. When n_rot covers the whole head the stock
+// ggml rope ops are used; otherwise the call goes to ggml_dsv4_rope_tail.
+static ggml_tensor * dsv4_apply_rope_tail(
+        ggml_context * ctx,
+        ggml_tensor * x,
+        ggml_tensor * inp_pos,
+        int64_t n_embd_head,
+        int64_t n_head,
+        int64_t n_tokens,
+        int64_t n_rot,
+        int rope_type,
+        int32_t n_ctx_orig,
+        float freq_base,
+        float freq_scale,
+        float ext_factor,
+        float attn_factor,
+        float beta_fast,
+        float beta_slow,
+        bool inverse) {
+    GGML_ASSERT(x->ne[0] == n_embd_head);
+    GGML_ASSERT(x->ne[1] == n_head);
+    GGML_ASSERT(x->ne[2] == n_tokens);
+
+    const bool rotate_whole_head = n_rot == n_embd_head;
+
+    if (!rotate_whole_head) {
+        // partial rotation: some leading dims must remain un-rotated
+        const int64_t n_nope = n_embd_head - n_rot;
+        GGML_ASSERT(n_nope > 0);
+
+        return ggml_dsv4_rope_tail(ctx, x, inp_pos, nullptr, n_rot, rope_type,
+            n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
+            beta_fast, beta_slow, inverse);
+    }
+
+    if (inverse) {
+        return ggml_rope_ext_back(ctx, x, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+    }
+    return ggml_rope_ext(ctx, x, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+}
+
+// Hyper-connection "pre" step: RMS-normalises the flattened [hc_dim, n_tokens]
+// input, projects it with hc_fn, splits the projection into pre/post/comb
+// weights via ggml_dsv4_hc_split_sinkhorn and returns the pre-weighted sum of
+// `x` together with those weights.
+static dsv4_hc_mix dsv4_hc_pre(
+        ggml_context * ctx,
+        ggml_tensor * x,
+        ggml_tensor * hc_fn,
+        ggml_tensor * hc_scale,
+        ggml_tensor * hc_base,
+        int64_t n_embd,
+        int64_t n_hc,
+        int64_t n_tokens,
+        float norm_eps,
+        int sinkhorn_iters,
+        float hc_eps) {
+    const int64_t hc_dim = n_embd * n_hc;
+
+    ggml_tensor * normed = ggml_cont(ctx, ggml_reshape_2d(ctx, x, hc_dim, n_tokens));
+    normed = ggml_rms_norm(ctx, normed, norm_eps);
+
+    ggml_tensor * mixes = ggml_mul_mat(ctx, hc_fn, normed); // [mix_hc, n_tokens]
+    ggml_tensor * parts = ggml_dsv4_hc_split_sinkhorn(ctx, mixes, hc_scale, hc_base, n_hc, sinkhorn_iters, hc_eps);
+
+    // Each row of `parts` is laid out as: n_hc pre | n_hc post | n_hc*n_hc comb.
+    const size_t row_stride = parts->nb[1];
+    const size_t elem_size  = parts->nb[0];
+
+    ggml_tensor * pre  = ggml_view_2d(ctx, parts, n_hc,        n_tokens, row_stride, 0);
+    ggml_tensor * post = ggml_view_2d(ctx, parts, n_hc,        n_tokens, row_stride, n_hc * elem_size);
+    ggml_tensor * comb = ggml_view_2d(ctx, parts, n_hc * n_hc, n_tokens, row_stride, 2 * n_hc * elem_size);
+
+    // the views are only made contiguous when there is more than one token row
+    const bool single_token = n_tokens == 1;
+    if (!single_token) {
+        pre  = ggml_cont(ctx, pre);
+        post = ggml_cont(ctx, post);
+        comb = ggml_cont(ctx, comb);
+    }
+    comb = ggml_reshape_3d(ctx, comb, n_hc, n_hc, n_tokens); // [src_hc, dst_hc, n_tokens]
+
+    ggml_tensor * mixed = ggml_dsv4_hc_weighted_sum(ctx, x, pre);
+
+    dsv4_hc_mix out;
+    out.x     = mixed;
+    out.mixes = mixes;
+    out.pre   = pre;
+    out.post  = post;
+    out.comb  = comb;
+    return out;
+}
+
+// Hyper-connection "post" step: validates the operand shapes and expands the
+// sub-block output `x` back into the hyper-connection streams via
+// ggml_dsv4_hc_expand.
+static ggml_tensor * dsv4_hc_post(
+        ggml_context * ctx,
+        ggml_tensor * x,
+        ggml_tensor * residual,
+        ggml_tensor * post,
+        ggml_tensor * comb,
+        int64_t n_embd,
+        int64_t n_hc,
+        int64_t n_tokens) {
+    // x: [n_embd, n_tokens]
+    GGML_ASSERT(x->ne[0] == n_embd);
+    GGML_ASSERT(x->ne[1] == n_tokens);
+    // residual: [n_embd, n_hc, n_tokens]
+    GGML_ASSERT(residual->ne[0] == n_embd);
+    GGML_ASSERT(residual->ne[1] == n_hc);
+    GGML_ASSERT(residual->ne[2] == n_tokens);
+    // post: [n_hc, n_tokens]
+    GGML_ASSERT(post->ne[0] == n_hc);
+    GGML_ASSERT(post->ne[1] == n_tokens);
+    // comb: [n_hc, n_hc, n_tokens]
+    GGML_ASSERT(comb->ne[0] == n_hc);
+    GGML_ASSERT(comb->ne[1] == n_hc);
+    GGML_ASSERT(comb->ne[2] == n_tokens);
+
+    ggml_tensor * expanded = ggml_dsv4_hc_expand(ctx, x, residual, post, comb);
+    return expanded;
+}
+
+// Output-head hyper-connection reduction: a sigmoid-gated weighted sum of the
+// streams of `x`.
+static ggml_tensor * dsv4_hc_head(
+        ggml_context * ctx,
+        ggml_tensor * x,
+        ggml_tensor * hc_fn,
+        ggml_tensor * hc_scale,
+        ggml_tensor * hc_base,
+        int64_t n_embd,
+        int64_t n_hc,
+        int64_t n_tokens,
+        float norm_eps,
+        float hc_eps) {
+    const int64_t hc_dim = n_embd * n_hc;
+
+    ggml_tensor * flat = ggml_cont(ctx, ggml_reshape_2d(ctx, x, hc_dim, n_tokens));
+    flat = ggml_rms_norm(ctx, flat, norm_eps);
+
+    ggml_tensor * pre = ggml_mul_mat(ctx, hc_fn, flat); // [hc, n_tokens]
+    pre = ggml_mul(ctx, pre, dsv4_view_scale(ctx, hc_scale, 0));
+    pre = ggml_add(ctx, pre, dsv4_view_base(ctx, hc_base, n_hc, 0));
+    pre = dsv4_add_scalar(ctx,
ggml_sigmoid(ctx, pre), hc_eps);
+
+    return ggml_dsv4_hc_weighted_sum(ctx, x, pre);
+}
+
+// Grouped low-rank attention output projection. The heads are split into
+// n_groups groups; each group is projected with its own slice of wo_a (via
+// ggml_mul_mat_id with identity group ids), the per-group results are
+// concatenated and then projected with wo_b.
+// Requires n_head to be divisible by n_groups.
+static ggml_tensor * dsv4_grouped_out(
+        ggml_context * ctx,
+        ggml_tensor * o,
+        ggml_tensor * wo_a,
+        ggml_tensor * wo_b,
+        int64_t n_embd_head,
+        int64_t n_head,
+        int64_t n_groups,
+        int64_t o_lora_rank,
+        int64_t n_tokens) {
+    GGML_ASSERT(n_head % n_groups == 0);
+
+    const int64_t group_heads = n_head / n_groups;
+    const int64_t group_dim = n_embd_head * group_heads;
+
+    o = ggml_cont(ctx, o);
+    o = ggml_reshape_3d(ctx, o, group_dim, n_groups, n_tokens);
+
+    // Treat each group's weight slice as one "expert" and select slice g for
+    // group g of every token: ids = [0, 1, ..., n_groups-1] repeated per token.
+    ggml_tensor * wo_a_g = ggml_reshape_3d(ctx, wo_a, group_dim, o_lora_rank, n_groups);
+    ggml_tensor * ids = ggml_arange(ctx, 0.0f, float(n_groups), 1.0f);
+    ids = ggml_cast(ctx, ids, GGML_TYPE_I32);
+    ids = ggml_repeat_4d(ctx, ids, n_groups, n_tokens, 1, 1);
+
+    ggml_tensor * low = ggml_mul_mat_id(ctx, wo_a_g, o, ids); // [o_lora_rank, n_groups, n_tokens]
+    low = ggml_reshape_2d(ctx, low, o_lora_rank * n_groups, n_tokens);
+
+    return ggml_mul_mat(ctx, wo_b, low);
+}
+
+// Softmax-weighted pooling along dim 0: softmax(score) is multiplied into kv,
+// the products are summed along dim 0, and the result is reshaped to
+// [kv->ne[1], kv->ne[2]].
+static ggml_tensor * dsv4_softmax_pool_ratio(
+        ggml_context * ctx,
+        ggml_tensor * kv,
+        ggml_tensor * score) {
+    score = ggml_soft_max(ctx, score);
+    ggml_tensor * pooled = ggml_mul(ctx, kv, score);
+    pooled = ggml_sum_rows(ctx, pooled);
+    return ggml_reshape_2d(ctx, pooled, kv->ne[1], kv->ne[2]);
+}
+
+// Shifts `x` by one slot along dim 2: slot 0 of the result is filled with
+// pad_value and slot i (i >= 1) holds slot i-1 of `x`. The last slot of `x`
+// is dropped.
+static ggml_tensor * dsv4_shift_overlap_state(
+        ggml_context * ctx,
+        ggml_tensor * x,
+        float pad_value) {
+    const int64_t n_embd = x->ne[0];
+    const int64_t ratio = x->ne[1];
+    const int64_t n_comp = x->ne[2];
+
+    // The first slot is only used as a shape template for the padding slot.
+    ggml_tensor * first = ggml_view_3d(ctx, x, n_embd, ratio, 1,
+        x->nb[1], x->nb[2], 0);
+    ggml_tensor * pad = ggml_fill(ctx, ggml_cont(ctx, first), pad_value);
+
+    if (n_comp == 1) {
+        return pad;
+    }
+
+    ggml_tensor * prev = ggml_view_3d(ctx, x, n_embd, ratio, n_comp - 1,
+        x->nb[1], x->nb[2], 0);
+    return ggml_concat(ctx, pad, prev, 2);
+}
+
+// Prefill-time compressor: pools every complete group of compress_ratio tokens
+// of `x` into one compressed entry, then RMS-normalises it, scales it by
+// `norm` and applies RoPE at positions `pos`.
+//
+// Only the first n_comp * compress_ratio tokens are consumed (n_comp =
+// n_tokens / compress_ratio, which must be > 0); trailing tokens are ignored
+// here. For compress_ratio == 4 the projections are twice as wide and each
+// entry pools over the previous group's first half together with the current
+// group's second half.
+static ggml_tensor * dsv4_build_compressor_prefill(
+        ggml_context * ctx,
+        ggml_tensor * x,
+        ggml_tensor * wkv,
+        ggml_tensor * wgate,
+        ggml_tensor * ape,
+        ggml_tensor * norm,
+        ggml_tensor * pos,
+        int64_t n_embd_head,
+        int64_t n_rot,
+        int64_t n_tokens,
+        int64_t compress_ratio,
+        int rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float norm_eps) {
+    GGML_ASSERT(compress_ratio > 0);
+    const int64_t n_comp = n_tokens / compress_ratio;
+    GGML_ASSERT(n_comp > 0);
+
+    const int64_t coff = compress_ratio == 4 ? 2 : 1;
+    const int64_t n_kv = coff * n_embd_head;
+    const int64_t cutoff = n_comp * compress_ratio;
+
+    ggml_tensor * kv = ggml_mul_mat(ctx, wkv, x); // [coff*head_dim, n_tokens]
+    ggml_tensor * score = ggml_mul_mat(ctx, wgate, x); // [coff*head_dim, n_tokens]
+
+    // Regroup the token axis as [n_kv, compress_ratio, n_comp] without copying.
+    kv = ggml_view_3d(ctx, kv, n_kv, compress_ratio, n_comp,
+        kv->nb[1],
+        kv->nb[1] * compress_ratio,
+        0);
+    score = ggml_view_3d(ctx, score, n_kv, compress_ratio, n_comp,
+        score->nb[1],
+        score->nb[1] * compress_ratio,
+        0);
+    GGML_ASSERT(cutoff <= n_tokens);
+
+    // Add the `ape` term to the scores, broadcast over all groups.
+    ggml_tensor * ape_f = ape->type == GGML_TYPE_F32 ? ape : ggml_cast(ctx, ape, GGML_TYPE_F32);
+    score = ggml_add(ctx, score, ggml_repeat(ctx, ape_f, score));
+
+    if (coff == 1) {
+        kv = ggml_cont(ctx, ggml_permute(ctx, kv, 1, 0, 2, 3)); // [ratio, head_dim, n_comp]
+        score = ggml_cont(ctx, ggml_permute(ctx, score, 1, 0, 2, 3)); // [ratio, head_dim, n_comp]
+        kv = dsv4_softmax_pool_ratio(ctx, kv, score); // [head_dim, n_comp]
+    } else {
+        // First n_embd_head columns feed the "previous" half, the remaining
+        // columns feed the "current" half.
+        ggml_tensor * kv_prev = ggml_view_3d(ctx, kv, n_embd_head, compress_ratio, n_comp,
+            kv->nb[1], kv->nb[2], 0);
+        ggml_tensor * kv_curr = ggml_view_3d(ctx, kv, n_embd_head, compress_ratio, n_comp,
+            kv->nb[1], kv->nb[2], n_embd_head * kv->nb[0]);
+        ggml_tensor * score_prev = ggml_view_3d(ctx, score, n_embd_head, compress_ratio, n_comp,
+            score->nb[1], score->nb[2], 0);
+        ggml_tensor * score_curr = ggml_view_3d(ctx, score, n_embd_head, compress_ratio, n_comp,
+            score->nb[1], score->nb[2], n_embd_head * score->nb[0]);
+
+        // Shift the "previous" half by one group; the first group has no
+        // predecessor, so it gets zero kv and -INFINITY scores (zero weight
+        // after the softmax).
+        kv_prev = dsv4_shift_overlap_state(ctx, kv_prev, 0.0f);
+        score_prev = dsv4_shift_overlap_state(ctx, score_prev, -INFINITY);
+
+        kv_prev = ggml_cont(ctx, ggml_permute(ctx, kv_prev, 1, 0, 2, 3)); // [ratio, head_dim, n_comp]
+        kv_curr = ggml_cont(ctx, ggml_permute(ctx, kv_curr, 1, 0, 2, 3));
+        score_prev = ggml_cont(ctx, ggml_permute(ctx, score_prev, 1, 0, 2, 3));
+        score_curr = ggml_cont(ctx, ggml_permute(ctx, score_curr, 1, 0, 2, 3));
+
+        kv = ggml_concat(ctx, kv_prev, kv_curr, 0); // [2*ratio, head_dim, n_comp]
+        score = ggml_concat(ctx, score_prev, score_curr, 0);
+        kv = dsv4_softmax_pool_ratio(ctx, kv, score); // [head_dim, n_comp]
+    }
+
+    kv = ggml_rms_norm(ctx, kv, norm_eps);
+    kv = ggml_mul(ctx, kv, norm);
+    kv = ggml_reshape_3d(ctx, kv, n_embd_head, 1, n_comp);
+
+    // One head, n_comp "tokens"; forward (non-inverse) rotation.
+    kv = dsv4_apply_rope_tail(ctx, kv, pos,
+        n_embd_head, 1, n_comp, n_rot, rope_type,
+        rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+        rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+
+    return kv;
+}
+
+static dsv4_state_pair
dsv4_build_compressor_prefill_state(
+        ggml_context * ctx,
+        ggml_tensor * x,
+        ggml_tensor * wkv,
+        ggml_tensor * wgate,
+        ggml_tensor * ape,
+        int64_t head_dim,
+        int64_t n_tokens,
+        int64_t compress_ratio) {
+    // Builds the compressor state left behind by a prefill of n_tokens tokens.
+    //
+    // The state holds the tokens after the last complete group of
+    // compress_ratio tokens (the "tail"), padded up to compress_ratio rows with
+    // zero kv and -INFINITY scores. For compress_ratio == 4 the rows of the
+    // last complete group are placed in front of the tail, giving
+    // 2 * compress_ratio rows. Scores carry the matching rows of `ape`.
+    //
+    // compress_ratio must be positive: it is used as a divisor below.
+    GGML_ASSERT(compress_ratio > 0);
+
+    const dsv4_state_layout layout = dsv4_make_state_layout(compress_ratio, head_dim);
+
+    const int64_t cutoff = (n_tokens / compress_ratio) * compress_ratio;
+    // Number of tail tokens; always in [0, compress_ratio), so the pad below
+    // always has at least one row.
+    const int64_t remainder = n_tokens - cutoff;
+
+    ggml_tensor * kv = ggml_mul_mat(ctx, wkv, x); // [width, n_tokens]
+    ggml_tensor * score = ggml_mul_mat(ctx, wgate, x);
+    ggml_tensor * ape_f = ape->type == GGML_TYPE_F32 ? ape : ggml_cast(ctx, ape, GGML_TYPE_F32);
+
+    // Tail window [width, compress_ratio]: the `remainder` tail rows followed
+    // by neutral padding rows.
+    auto build_tail_window = [&]() -> dsv4_state_pair {
+        ggml_tensor * kv_pad = dsv4_new_filled_2d(ctx, layout.width, compress_ratio - remainder, 0.0f);
+        ggml_tensor * sc_pad = dsv4_new_filled_2d(ctx, layout.width, compress_ratio - remainder, -INFINITY);
+
+        if (remainder == 0) {
+            // no tail tokens: the padding already spans the whole window
+            return { kv_pad, sc_pad };
+        }
+
+        ggml_tensor * kv_rem = ggml_view_2d(ctx, kv, layout.width, remainder, kv->nb[1], cutoff*kv->nb[1]);
+        ggml_tensor * sc_rem = ggml_view_2d(ctx, score, layout.width, remainder, score->nb[1], cutoff*score->nb[1]);
+        sc_rem = ggml_add(ctx, sc_rem, ggml_view_2d(ctx, ape_f, layout.width, remainder, ape_f->nb[1], 0));
+
+        return {
+            ggml_concat(ctx, kv_rem, kv_pad, 1),
+            ggml_concat(ctx, sc_rem, sc_pad, 1),
+        };
+    };
+
+    if (compress_ratio != 4) {
+        return build_tail_window();
+    }
+
+    // Ratio 4: the rows of the last complete group go in front of the tail.
+    // Without a complete group that half is neutral padding.
+    ggml_tensor * kv_prev;
+    ggml_tensor * score_prev;
+    if (cutoff >= compress_ratio) {
+        kv_prev = ggml_view_2d(ctx, kv, layout.width, compress_ratio, kv->nb[1], (cutoff - compress_ratio)*kv->nb[1]);
+        score_prev = ggml_view_2d(ctx, score, layout.width, compress_ratio, score->nb[1], (cutoff - compress_ratio)*score->nb[1]);
+        score_prev = ggml_add(ctx, score_prev, ape_f);
+    } else {
+        kv_prev = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, 0.0f);
+        score_prev = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, -INFINITY);
+    }
+
+    const dsv4_state_pair curr = build_tail_window();
+
+    return {
+        ggml_concat(ctx, kv_prev, curr.kv, 1),
+        ggml_concat(ctx, score_prev, curr.score, 1),
+    };
+}
+
+// Pools one decode-time state window ([width, n_rows] kv and score) into a
+// single entry, RMS-normalises it, scales it by `norm` and applies RoPE at
+// position tensor `pos`. The result is [head_dim, 1, 1].
+static ggml_tensor * dsv4_pool_decode_state(
+        ggml_context * ctx,
+        ggml_tensor * kv,
+        ggml_tensor * score,
+        ggml_tensor * norm,
+        ggml_tensor * pos,
+        int64_t head_dim,
+        int64_t n_rot,
+        int rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float norm_eps) {
+    const int64_t n_rows = kv->ne[1];
+    // put the row axis first so the pooling helper reduces over it
+    kv = ggml_reshape_3d(ctx, ggml_cont(ctx, ggml_transpose(ctx, kv)), n_rows, head_dim, 1);
+    score = ggml_reshape_3d(ctx, ggml_cont(ctx, ggml_transpose(ctx, score)), n_rows, head_dim, 1);
+
+    ggml_tensor * pooled = dsv4_softmax_pool_ratio(ctx, kv, score);
+    pooled = ggml_rms_norm(ctx, pooled, norm_eps);
+    pooled = ggml_mul(ctx, pooled, norm);
+    pooled = ggml_reshape_3d(ctx, pooled, head_dim, 1, 1);
+
+    return dsv4_apply_rope_tail(ctx, pooled, pos,
+        head_dim, 1, 1, n_rot, rope_type,
+        rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+        rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast,
rope_cfg.beta_slow, false);
+}
+
+// Forward declaration; defined after dsv4_build_compressor_decode below.
+static dsv4_decode_compressor dsv4_build_compressor_decode_projected(
+        ggml_context * ctx,
+        ggml_tensor * kv_cur,
+        ggml_tensor * sc_cur,
+        ggml_tensor * prev_kv_state,
+        ggml_tensor * prev_score_state,
+        ggml_tensor * norm,
+        int64_t head_dim,
+        int64_t n_rot,
+        int64_t pos,
+        int64_t compress_ratio,
+        int rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float norm_eps);
+
+// Single-token decode step of the compressor: projects `x` with wkv/wgate,
+// adds the `ape` row for this position's slot (pos % compress_ratio) to the
+// score, and hands the projected rows to
+// dsv4_build_compressor_decode_projected.
+static dsv4_decode_compressor dsv4_build_compressor_decode(
+        ggml_context * ctx,
+        ggml_tensor * x,
+        ggml_tensor * prev_kv_state,
+        ggml_tensor * prev_score_state,
+        ggml_tensor * wkv,
+        ggml_tensor * wgate,
+        ggml_tensor * ape,
+        ggml_tensor * norm,
+        int64_t head_dim,
+        int64_t n_rot,
+        int64_t pos,
+        int64_t compress_ratio,
+        int rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float norm_eps) {
+    const dsv4_state_layout layout = dsv4_make_state_layout(compress_ratio, head_dim);
+    const int64_t pos_mod = pos % compress_ratio;
+
+    ggml_tensor * kv_cur = ggml_mul_mat(ctx, wkv, x); // [width, 1]
+    ggml_tensor * sc_cur = ggml_mul_mat(ctx, wgate, x);
+    ggml_tensor * ape_f = ape->type == GGML_TYPE_F32 ? ape : ggml_cast(ctx, ape, GGML_TYPE_F32);
+    sc_cur = ggml_add(ctx, sc_cur, ggml_view_2d(ctx, ape_f, layout.width, 1, ape_f->nb[1], pos_mod*ape_f->nb[1]));
+
+    return dsv4_build_compressor_decode_projected(ctx,
+        kv_cur, sc_cur,
+        prev_kv_state, prev_score_state,
+        norm,
+        head_dim, n_rot, pos, compress_ratio,
+        rope_type, rope_cfg, norm_eps);
+}
+
+// Writes one already-projected kv/score row into the compressor state and,
+// when `pos` completes a group ((pos + 1) % compress_ratio == 0), pools the
+// state into a new compressed entry.
+//
+// Returns the updated state tensors and the pooled entry (kv_comp), which is
+// nullptr on steps that do not complete a group. For compress_ratio == 4 the
+// row is written into the second half of the state, and after pooling the
+// returned state is the second half repeated twice.
+static dsv4_decode_compressor dsv4_build_compressor_decode_projected(
+        ggml_context * ctx,
+        ggml_tensor * kv_cur,
+        ggml_tensor * sc_cur,
+        ggml_tensor * prev_kv_state,
+        ggml_tensor * prev_score_state,
+        ggml_tensor * norm,
+        int64_t head_dim,
+        int64_t n_rot,
+        int64_t pos,
+        int64_t compress_ratio,
+        int rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float norm_eps) {
+    const dsv4_state_layout layout = dsv4_make_state_layout(compress_ratio, head_dim);
+    const int64_t pos_mod = pos % compress_ratio;
+    // ratio 4 keeps the previous group in rows [0, ratio) and writes the
+    // current group into rows [ratio, 2*ratio)
+    const int64_t row = compress_ratio == 4 ? compress_ratio + pos_mod : pos_mod;
+    const bool should_compress = (pos + 1) % compress_ratio == 0;
+
+    // Single-row write via cpy-into-view. ggml_set_rows would crash on
+    // multi-GPU (sched routes by src device while dst is on a different
+    // device; see dsv4_store_cache_rows for the same problem and fix).
+    //
+    // We need to return a FULL-shape view of dst (downstream code at
+    // dsv4_view_cols slices the full state by columns/rows) AND establish
+    // a data dependency on the cpy. We mimic ggml_set_rows's internal
+    // construction: create a view_tensor of dst (which inherits dst's full
+    // shape), then manually set src[0] to the cpy result so sched orders
+    // the cpy before any consumer reading from this view.
+    // NOTE(review): this relies on the graph builder following src[0] of a
+    // plain view tensor — confirm against the ggml version in use.
+    auto cpy_into_row = [&](ggml_tensor * dst, ggml_tensor * row_src) -> ggml_tensor * {
+        ggml_tensor * row_view = ggml_view_2d(ctx, dst,
+            dst->ne[0], 1,
+            dst->nb[1],
+            row * dst->nb[1]);
+        ggml_tensor * cpy = ggml_cpy(ctx, row_src, row_view);
+        ggml_tensor * full_state = ggml_view_tensor(ctx, dst);
+        full_state->src[0] = cpy; // dependency: full_state's consumers wait for cpy
+        return full_state;
+    };
+    ggml_tensor * kv_state = cpy_into_row(prev_kv_state, kv_cur);
+    ggml_tensor * score_state = cpy_into_row(prev_score_state, sc_cur);
+    ggml_tensor * kv_comp = nullptr;
+
+    if (should_compress) {
+        ggml_tensor * kv_pool;
+        ggml_tensor * score_pool;
+
+        if (compress_ratio == 4) {
+            // Pool over the first head_dim columns of the previous-group rows
+            // and the last head_dim columns of the current-group rows.
+            ggml_tensor * kv_prev = dsv4_view_cols(ctx, kv_state, head_dim, compress_ratio, 0, 0);
+            ggml_tensor * kv_curr = dsv4_view_cols(ctx, kv_state, head_dim, compress_ratio, head_dim, compress_ratio);
+            ggml_tensor * sc_prev = dsv4_view_cols(ctx, score_state, head_dim, compress_ratio, 0, 0);
+            ggml_tensor * sc_curr = dsv4_view_cols(ctx, score_state, head_dim, compress_ratio, head_dim, compress_ratio);
+
+            kv_pool = ggml_concat(ctx, kv_prev, kv_curr, 1);
+            score_pool = ggml_concat(ctx, sc_prev, sc_curr, 1);
+
+            // The current-group rows become the next step's previous-group
+            // rows. The same rows are also placed in the second half.
+            // NOTE(review): presumably those second-half rows are overwritten
+            // row by row before the next pooling step — confirm with the caller.
+            ggml_tensor * shifted_kv = dsv4_view_cols(ctx, kv_state, layout.width, compress_ratio, 0, compress_ratio);
+            ggml_tensor * shifted_score = dsv4_view_cols(ctx, score_state, layout.width, compress_ratio, 0, compress_ratio);
+            kv_state = ggml_concat(ctx, shifted_kv, shifted_kv, 1);
+            score_state = ggml_concat(ctx, shifted_score, shifted_score, 1);
+        } else {
+            kv_pool = kv_state;
+            score_pool = score_state;
+        }
+
+        // The pooled entry is rotated at the position of the first token of
+        // the group that just completed (pos + 1 - compress_ratio).
+        ggml_tensor * comp_pos = dsv4_arange_i32(ctx, pos + 1 - compress_ratio, pos + 2 - compress_ratio);
+        kv_comp = dsv4_pool_decode_state(ctx, kv_pool, score_pool, norm, comp_pos,
+            head_dim, n_rot, rope_type, rope_cfg, norm_eps);
+    }
+
+    return { kv_state, score_state, kv_comp };
+}
+
+static dsv4_decode_compressor dsv4_build_compressor_decode_chunk(
+        ggml_context * ctx,
+
ggml_tensor * x, + ggml_tensor * prev_kv_state, + ggml_tensor * prev_score_state, + ggml_tensor * wkv, + ggml_tensor * wgate, + ggml_tensor * ape, + ggml_tensor * norm, + const llama_ubatch & ubatch, + int64_t head_dim, + int64_t n_rot, + int64_t n_tokens, + int64_t compress_ratio, + int rope_type, + const dsv4_rope_cfg & rope_cfg, + float norm_eps) { + const dsv4_state_layout layout = dsv4_make_state_layout(compress_ratio, head_dim); + + ggml_tensor * kv_all = ggml_mul_mat(ctx, wkv, x); // [width, n_tokens] + ggml_tensor * sc_all = ggml_mul_mat(ctx, wgate, x); + ggml_tensor * ape_f = ape->type == GGML_TYPE_F32 ? ape : ggml_cast(ctx, ape, GGML_TYPE_F32); + + ggml_tensor * kv_state = prev_kv_state; + ggml_tensor * score_state = prev_score_state; + ggml_tensor * kv_comp = nullptr; + + for (int64_t i = 0; i < n_tokens; ++i) { + const llama_pos pos = ubatch.pos ? ubatch.pos[i] : (llama_pos) i; + const int64_t pos_mod = pos % compress_ratio; + + ggml_tensor * kv_cur = ggml_view_2d(ctx, kv_all, layout.width, 1, kv_all->nb[1], i*kv_all->nb[1]); + ggml_tensor * sc_cur = ggml_view_2d(ctx, sc_all, layout.width, 1, sc_all->nb[1], i*sc_all->nb[1]); + sc_cur = ggml_add(ctx, sc_cur, ggml_view_2d(ctx, ape_f, layout.width, 1, ape_f->nb[1], pos_mod*ape_f->nb[1])); + + dsv4_decode_compressor dec = dsv4_build_compressor_decode_projected(ctx, + kv_cur, + sc_cur, + kv_state, + score_state, + norm, + head_dim, + n_rot, + pos, + compress_ratio, + rope_type, + rope_cfg, + norm_eps); + + kv_state = dec.kv_state; + score_state = dec.score_state; + if (dec.kv_comp != nullptr) { + kv_comp = kv_comp == nullptr ? 
dec.kv_comp : ggml_concat(ctx, kv_comp, dec.kv_comp, 2); + } + } + + return { kv_state, score_state, kv_comp }; +} + +static ggml_tensor * dsv4_build_indexer_scores_prefill( + ggml_context * ctx, + ggml_tensor * x, + ggml_tensor * qr, + ggml_tensor * index_kv, + ggml_tensor * wq_b, + ggml_tensor * wproj, + ggml_tensor * pos, + ggml_tensor * causal_mask, + int64_t n_index_head, + int64_t n_index_head_size, + int64_t n_tokens, + int64_t n_rot, + int rope_type, + const dsv4_rope_cfg & rope_cfg) { + ggml_tensor * q = ggml_mul_mat(ctx, wq_b, qr); + q = ggml_reshape_3d(ctx, q, n_index_head_size, n_index_head, n_tokens); + q = dsv4_apply_rope_tail(ctx, q, pos, + n_index_head_size, n_index_head, n_tokens, n_rot, rope_type, + rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale, + rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false); + + ggml_tensor * k = ggml_permute(ctx, index_kv, 0, 2, 1, 3); // [head_dim, n_comp, 1] + q = ggml_permute(ctx, q, 0, 2, 1, 3); // [head_dim, n_tokens, n_heads] + + ggml_tensor * score = ggml_mul_mat(ctx, k, q); // [n_comp, n_tokens, n_heads] + score = ggml_relu(ctx, score); + + ggml_tensor * weights = ggml_mul_mat(ctx, wproj, x); // [n_heads, n_tokens] + const float scale = 1.0f / std::sqrt(float(n_index_head_size) * float(n_index_head)); + weights = dsv4_mul_scalar(ctx, weights, scale); + weights = ggml_reshape_3d(ctx, weights, 1, n_index_head, n_tokens); + weights = ggml_permute(ctx, weights, 0, 2, 1, 3); // [1, n_tokens, n_heads] + + score = ggml_mul(ctx, score, weights); + score = ggml_cont(ctx, ggml_permute(ctx, score, 1, 2, 0, 3)); // [n_heads, n_comp, n_tokens] + score = ggml_sum_rows(ctx, score); // [1, n_comp, n_tokens] + score = ggml_reshape_2d(ctx, score, index_kv->ne[2], n_tokens); + + return ggml_add(ctx, score, causal_mask); +} + +static ggml_tensor * dsv4_build_indexer_scores_decode( + ggml_context * ctx, + ggml_tensor * x, + ggml_tensor * qr, + ggml_tensor * index_kv, + ggml_tensor * 
wq_b, + ggml_tensor * wproj, + ggml_tensor * pos, + int64_t n_index_head, + int64_t n_index_head_size, + int64_t n_comp, + int64_t n_rot, + int rope_type, + const dsv4_rope_cfg & rope_cfg) { + ggml_tensor * q = ggml_mul_mat(ctx, wq_b, qr); + q = ggml_reshape_3d(ctx, q, n_index_head_size, n_index_head, 1); + q = dsv4_apply_rope_tail(ctx, q, pos, + n_index_head_size, n_index_head, 1, n_rot, rope_type, + rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale, + rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false); + + ggml_tensor * k = ggml_reshape_3d(ctx, index_kv, n_index_head_size, 1, n_comp); + k = ggml_permute(ctx, k, 0, 2, 1, 3); // [head_dim, n_comp, 1] + q = ggml_permute(ctx, q, 0, 2, 1, 3); // [head_dim, 1, n_heads] + + ggml_tensor * score = ggml_mul_mat(ctx, k, q); // [n_comp, 1, n_heads] + score = ggml_relu(ctx, score); + + ggml_tensor * weights = ggml_mul_mat(ctx, wproj, x); // [n_heads, 1] + const float scale = 1.0f / std::sqrt(float(n_index_head_size) * float(n_index_head)); + weights = dsv4_mul_scalar(ctx, weights, scale); + weights = ggml_reshape_3d(ctx, weights, 1, n_index_head, 1); + weights = ggml_permute(ctx, weights, 0, 2, 1, 3); // [1, 1, n_heads] + + score = ggml_mul(ctx, score, weights); + score = ggml_cont(ctx, ggml_permute(ctx, score, 1, 2, 0, 3)); // [n_heads, n_comp, 1] + score = ggml_sum_rows(ctx, score); + return ggml_reshape_2d(ctx, score, n_comp, 1); +} + +static ggml_tensor * dsv4_build_compressed_mask_from_topk( + ggml_context * ctx, + ggml_tensor * scores, + ggml_tensor * topk) { + const int64_t n_comp = scores->ne[0]; + const int64_t n_tokens = scores->ne[1]; + + ggml_tensor * scores_rows = ggml_reshape_3d(ctx, scores, 1, scores->ne[0], scores->ne[1]); + ggml_tensor * selected_scores = ggml_get_rows(ctx, scores_rows, topk); // [1, top_k, n_tokens] + ggml_tensor * valid = ggml_step(ctx, dsv4_add_scalar(ctx, selected_scores, 1.0e30f)); + ggml_tensor * values = dsv4_mul_scalar(ctx, 
dsv4_add_scalar(ctx, valid, -1.0f), 1.0e9f); + + ggml_tensor * mask = dsv4_new_filled_3d(ctx, 1, n_comp, n_tokens, -INFINITY); + mask = ggml_set_rows(ctx, mask, values, topk); + return ggml_reshape_2d(ctx, mask, n_comp, n_tokens); +} + +static ggml_tensor * dsv4_cache_view_3d(ggml_context * ctx, ggml_tensor * cache, int64_t n_rows) { + ggml_tensor * view = ggml_view_2d(ctx, cache, cache->ne[0], n_rows, cache->nb[1], 0); + return ggml_reshape_3d(ctx, view, cache->ne[0], 1, n_rows); +} + +} // namespace + +llama_model_deepseek4::graph::graph(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + + const int64_t n_hc = hparams.n_hc; + const int64_t n_lora_q = hparams.n_lora_q; + const int64_t n_lora_o = hparams.n_lora_o; + const int64_t n_out_group = hparams.n_attn_out_groups; + + GGML_ASSERT(n_hc > 0); + GGML_ASSERT(n_lora_q > 0); + GGML_ASSERT(n_lora_o > 0); + GGML_ASSERT(n_out_group > 0); + GGML_ASSERT(n_embd_head_k == n_embd_head_v); + ggml_tensor * inpL = build_inp_embd(model.tok_embd); + ggml_tensor * inp_tokens = res->t_inp_tokens; + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + auto * inp_mem = build_inp_mem_hybrid_iswa(); + auto * inp_attn = inp_mem->get_attn(); + auto * inp_rs = inp_mem->get_recr(); + const auto * mctx_dsv4 = inp_mem->mctx; + dsv4_graph_inputs * inp_dsv4 = nullptr; + auto get_dsv4_inputs = [&]() { + if (inp_dsv4 == nullptr) { + auto inputs = std::make_unique(); + inp_dsv4 = inputs.get(); + res->add_input(std::move(inputs)); + } + return inp_dsv4; + }; + + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, 1, n_tokens); + inpL = ggml_repeat_4d(ctx0, inpL, n_embd, n_hc, n_tokens, 1); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_hc, n_tokens); + + const float kq_scale = 1.0f / std::sqrt(float(n_embd_head_k)); + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + const uint32_t compress_ratio = hparams.attn_compress_ratio[il]; + 
const dsv4_rope_cfg rope_cfg = dsv4_make_rope_cfg(hparams, cparams, compress_ratio); + const bool is_prefill = ubatch.pos == nullptr || ubatch.pos[0] == 0; + + if (compress_ratio != 0) { + if (compress_ratio != 4 && compress_ratio != 128) { + throw std::runtime_error("DeepSeek V4 unsupported attention compression ratio " + std::to_string(compress_ratio)); + } + // The hybrid memory splitter emits one sequence set per ubatch + // for compressed DeepSeek V4 attention. + GGML_ASSERT(ubatch.n_seqs == 1); + } + + ggml_tensor * residual = inpL; + dsv4_hc_mix mix = dsv4_hc_pre(ctx0, inpL, + layer.hc_attn_fn, layer.hc_attn_scale, layer.hc_attn_base, + n_embd, n_hc, n_tokens, norm_rms_eps, hparams.hc_sinkhorn_iters, hparams.hc_eps); + ggml_tensor * cur = mix.x; + cb(cur, "hc_attn_pre", il); + cb(mix.mixes, "hc_attn_pre_mixes", il); + cb(mix.pre, "hc_attn_pre_weights", il); + cb(mix.post, "hc_attn_pre_post_weights", il); + cb(mix.comb, "hc_attn_pre_comb", il); + cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + ggml_tensor * qr = ggml_mul_mat(ctx0, layer.wq_a, cur); + cb(qr, "q_lora", il); + qr = build_norm(qr, layer.attn_q_a_norm, nullptr, LLM_NORM_RMS, il); + cb(qr, "q_lora_norm", il); + + ggml_tensor * q = ggml_mul_mat(ctx0, layer.wq_b, qr); + q = ggml_reshape_3d(ctx0, q, n_embd_head_k, n_head, n_tokens); + q = ggml_rms_norm(ctx0, q, norm_rms_eps); + cb(q, "Qnorm", il); + q = dsv4_apply_rope_tail(ctx0, q, inp_pos, + n_embd_head_k, n_head, n_tokens, n_rot, rope_type, + rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale, + rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false); + cb(q, "Qcur", il); + ggml_tensor * kv = ggml_mul_mat(ctx0, layer.attn_kv, cur); + kv = build_norm(kv, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); + kv = ggml_reshape_3d(ctx0, kv, n_embd_head_k, 1, n_tokens); + cb(kv, "KVnorm", il); + kv = dsv4_apply_rope_tail(ctx0, kv, inp_pos, + n_embd_head_k, 1, 
n_tokens, n_rot, rope_type, + rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale, + rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false); + cb(kv, "KVrope", il); + kv = ggml_dsv4_fp8_kv_quantize(ctx0, kv, n_rot); + cb(kv, "KVcur", il); + + const auto * mctx_swa = inp_attn->mctx->get_swa(); + ggml_build_forward_expand(gf, q); + ggml_build_forward_expand(gf, kv); + ggml_build_forward_expand(gf, mctx_swa->cpy_k(ctx0, kv, inp_attn->get_k_idxs_swa(), il)); + + if (compress_ratio == 0) { + ggml_tensor * k_cache = mctx_swa->get_k(ctx0, il); + k_cache = ggml_reshape_3d(ctx0, k_cache, n_embd_head_k, 1, k_cache->ne[2]); + cur = build_attn_mha(q, k_cache, k_cache, nullptr, inp_attn->get_kq_mask_swa(), + layer.attn_sinks, nullptr, kq_scale, il); + cb(cur, "kqv_out", il); + } else { + ggml_tensor * k_all = kv; + ggml_tensor * v_all = kv; + ggml_tensor * attn_mask = nullptr; + const llama_seq_id seq_id = ubatch.seq_id[0][0]; + auto store_attn_cache_rows = [&](ggml_tensor * src, int64_t row_start, int64_t n_rows) { + for (int32_t is = 0; is < ubatch.n_seq_id[0]; ++is) { + const llama_seq_id dst_seq_id = ubatch.seq_id[0][is]; + dsv4_store_cache_rows(ctx0, gf, mctx_dsv4->get_dsv4_attn_k(ctx0, il, dst_seq_id), src, row_start, n_rows); + } + }; + auto store_index_cache_rows = [&](ggml_tensor * src, int64_t row_start, int64_t n_rows) { + for (int32_t is = 0; is < ubatch.n_seq_id[0]; ++is) { + const llama_seq_id dst_seq_id = ubatch.seq_id[0][is]; + dsv4_store_cache_rows(ctx0, gf, mctx_dsv4->get_dsv4_index_k(ctx0, il, dst_seq_id), src, row_start, n_rows); + } + }; + const int64_t state_size = hparams.n_embd_r(); + const dsv4_state_layout attn_state_layout = dsv4_make_state_layout(compress_ratio, n_embd_head_k); + + ggml_tensor * prev_kv_state_all = build_rs(inp_rs, inp_rs->mctx->get_r_l(il), state_size, ubatch.n_seqs); + ggml_tensor * prev_sc_state_all = build_rs(inp_rs, inp_rs->mctx->get_s_l(il), state_size, ubatch.n_seqs); + ggml_tensor * 
prev_attn_kv_state = dsv4_view_state_segment(ctx0, prev_kv_state_all, 0, attn_state_layout.width, attn_state_layout.rows); + ggml_tensor * prev_attn_sc_state = dsv4_view_state_segment(ctx0, prev_sc_state_all, 0, attn_state_layout.width, attn_state_layout.rows); + + const int64_t n_comp = n_tokens / compress_ratio; + if (is_prefill) { + dsv4_state_pair state = dsv4_build_compressor_prefill_state(ctx0, cur, + layer.attn_compressor_kv, + layer.attn_compressor_gate, + layer.attn_compressor_ape, + n_embd_head_k, + n_tokens, + compress_ratio); + dsv4_store_state_segment(ctx0, gf, state.kv, inp_rs->mctx->get_r_l(il), state_size, inp_rs->head, 0); + dsv4_store_state_segment(ctx0, gf, state.score, inp_rs->mctx->get_s_l(il), state_size, inp_rs->head, 0); + + if (compress_ratio == 4) { + const dsv4_state_layout index_state_layout = dsv4_make_state_layout(compress_ratio, hparams.indexer_head_size); + dsv4_state_pair index_state = dsv4_build_compressor_prefill_state(ctx0, cur, + layer.indexer_compressor_kv, + layer.indexer_compressor_gate, + layer.indexer_compressor_ape, + hparams.indexer_head_size, + n_tokens, + compress_ratio); + dsv4_store_state_segment(ctx0, gf, index_state.kv, inp_rs->mctx->get_r_l(il), state_size, inp_rs->head, attn_state_layout.elems); + dsv4_store_state_segment(ctx0, gf, index_state.score, inp_rs->mctx->get_s_l(il), state_size, inp_rs->head, attn_state_layout.elems); + GGML_ASSERT(attn_state_layout.elems + index_state_layout.elems <= state_size); + } + } + + if (is_prefill && n_comp > 0) { + ggml_tensor * comp_pos = ggml_arange(ctx0, 0.0f, float(n_comp * compress_ratio), float(compress_ratio)); + comp_pos = ggml_cast(ctx0, comp_pos, GGML_TYPE_I32); + + ggml_tensor * kv_comp = dsv4_build_compressor_prefill(ctx0, cur, + layer.attn_compressor_kv, + layer.attn_compressor_gate, + layer.attn_compressor_ape, + layer.attn_compressor_norm, + comp_pos, + n_embd_head_k, n_rot, n_tokens, compress_ratio, rope_type, rope_cfg, norm_rms_eps); + kv_comp = 
ggml_dsv4_fp8_kv_quantize(ctx0, kv_comp, n_rot); + cb(kv_comp, "KVcompress", il); + + store_attn_cache_rows(kv_comp, 0, n_comp); + + k_all = ggml_concat(ctx0, kv, kv_comp, 2); + v_all = k_all; + + if (compress_ratio == 4) { + ggml_tensor * raw_mask = get_dsv4_inputs()->add_mask(ctx0, + dsv4_mask_kind::RAW_WINDOW, + n_tokens, n_tokens, + n_tokens, n_comp, hparams.n_swa, compress_ratio, + "dsv4_attn_raw_window_mask"); + ggml_tensor * index_mask = get_dsv4_inputs()->add_mask(ctx0, + dsv4_mask_kind::COMPRESS_CAUSAL, + n_comp, n_tokens, + 0, n_comp, 0, compress_ratio, + "dsv4_indexer_causal_mask"); + + ggml_tensor * index_kv = dsv4_build_compressor_prefill(ctx0, cur, + layer.indexer_compressor_kv, + layer.indexer_compressor_gate, + layer.indexer_compressor_ape, + layer.indexer_compressor_norm, + comp_pos, + hparams.indexer_head_size, n_rot, n_tokens, compress_ratio, rope_type, rope_cfg, norm_rms_eps); + cb(index_kv, "indexer_KVcompress", il); + + store_index_cache_rows(index_kv, 0, n_comp); + + ggml_tensor * index_scores = dsv4_build_indexer_scores_prefill(ctx0, + cur, qr, index_kv, + layer.indexer_attn_q_b, + layer.indexer_proj, + inp_pos, + index_mask, + hparams.indexer_n_head, + hparams.indexer_head_size, + n_tokens, + n_rot, + rope_type, + rope_cfg); + cb(index_scores, "indexer_scores", il); + + const int top_k = std::min(hparams.indexer_top_k, n_comp); + ggml_tensor * topk = ggml_argsort_top_k(ctx0, index_scores, top_k); + cb(topk, "indexer_topk", il); + + ggml_tensor * comp_mask = dsv4_build_compressed_mask_from_topk(ctx0, index_scores, topk); + cb(comp_mask, "dsv4_attn_compress_mask", il); + + attn_mask = ggml_concat(ctx0, raw_mask, comp_mask, 0); + } else { + attn_mask = get_dsv4_inputs()->add_mask(ctx0, + dsv4_mask_kind::ATTN_STATIC, + n_tokens + n_comp, n_tokens, + n_tokens, n_comp, hparams.n_swa, compress_ratio, + "dsv4_attn_static_mask"); + } + } else { + attn_mask = get_dsv4_inputs()->add_mask(ctx0, + dsv4_mask_kind::RAW_WINDOW, + n_tokens, n_tokens, + 
n_tokens, 0, hparams.n_swa, compress_ratio, + "dsv4_attn_raw_window_mask"); + } + + if (!is_prefill) { + const llama_pos first_pos = ubatch.pos ? ubatch.pos[0] : 0; + const llama_pos last_pos = ubatch.pos ? ubatch.pos[n_tokens - 1] : n_tokens - 1; + const int64_t n_comp_before = first_pos / compress_ratio; + const int64_t n_comp_visible = (last_pos + 1) / compress_ratio; + const int64_t n_comp_cache = mctx_dsv4->get_dsv4_n_comp(il); + GGML_ASSERT(n_comp_visible <= n_comp_cache); + + dsv4_decode_compressor dec = n_tokens == 1 + ? dsv4_build_compressor_decode(ctx0, cur, + prev_attn_kv_state, + prev_attn_sc_state, + layer.attn_compressor_kv, + layer.attn_compressor_gate, + layer.attn_compressor_ape, + layer.attn_compressor_norm, + n_embd_head_k, + n_rot, + first_pos, + compress_ratio, + rope_type, + rope_cfg, + norm_rms_eps) + : dsv4_build_compressor_decode_chunk(ctx0, cur, + prev_attn_kv_state, + prev_attn_sc_state, + layer.attn_compressor_kv, + layer.attn_compressor_gate, + layer.attn_compressor_ape, + layer.attn_compressor_norm, + ubatch, + n_embd_head_k, + n_rot, + n_tokens, + compress_ratio, + rope_type, + rope_cfg, + norm_rms_eps); + + dsv4_store_state_segment(ctx0, gf, dec.kv_state, inp_rs->mctx->get_r_l(il), state_size, inp_rs->head, 0); + dsv4_store_state_segment(ctx0, gf, dec.score_state, inp_rs->mctx->get_s_l(il), state_size, inp_rs->head, 0); + + if (dec.kv_comp != nullptr) { + dec.kv_comp = ggml_dsv4_fp8_kv_quantize(ctx0, dec.kv_comp, n_rot); + store_attn_cache_rows(dec.kv_comp, n_comp_before, n_comp_visible - n_comp_before); + } + + ggml_tensor * k_raw = mctx_swa->get_k(ctx0, il); + k_raw = ggml_reshape_3d(ctx0, k_raw, n_embd_head_k, 1, k_raw->ne[2]); + k_all = k_raw; + v_all = k_raw; + attn_mask = inp_attn->self_kq_mask_swa; + + if (n_comp_visible > 0) { + ggml_tensor * kv_comp_cache = dsv4_cache_view_3d(ctx0, mctx_dsv4->get_dsv4_attn_k(ctx0, il, seq_id), n_comp_visible); + // V4's KV cache is F16 (forced via llama-model.cpp); CUDA's + // ggml_concat 
asserts F32 (ggml-cuda/concat.cu) — every other + // architecture's concat takes F32 inputs from mul_mat/norm/rope, + // so the assertion is correct. Cast both F16 inputs to F32 for + // the concat, then cast the result back to F16 to preserve the + // f16-KV-pin invariant for downstream attention. Metal's concat + // is type-agnostic; on Metal these casts are accepted but the + // intermediate F32 round-trip is wasted work. CPU concat handles + // both types so it's also a no-op cost there. + ggml_tensor * k_raw_f32 = ggml_cast(ctx0, k_raw, GGML_TYPE_F32); + ggml_tensor * comp_f32 = ggml_cast(ctx0, kv_comp_cache, GGML_TYPE_F32); + ggml_tensor * concat_f32 = ggml_concat(ctx0, k_raw_f32, comp_f32, 2); + k_all = ggml_cast(ctx0, concat_f32, GGML_TYPE_F16); + v_all = k_all; + + ggml_tensor * comp_mask = nullptr; + if (compress_ratio == 4) { + const dsv4_state_layout index_state_layout = dsv4_make_state_layout(compress_ratio, hparams.indexer_head_size); + ggml_tensor * prev_index_kv_state = dsv4_view_state_segment(ctx0, prev_kv_state_all, + attn_state_layout.elems, index_state_layout.width, index_state_layout.rows); + ggml_tensor * prev_index_sc_state = dsv4_view_state_segment(ctx0, prev_sc_state_all, + attn_state_layout.elems, index_state_layout.width, index_state_layout.rows); + + dsv4_decode_compressor index_dec = n_tokens == 1 + ? 
dsv4_build_compressor_decode(ctx0, cur, + prev_index_kv_state, + prev_index_sc_state, + layer.indexer_compressor_kv, + layer.indexer_compressor_gate, + layer.indexer_compressor_ape, + layer.indexer_compressor_norm, + hparams.indexer_head_size, + n_rot, + first_pos, + compress_ratio, + rope_type, + rope_cfg, + norm_rms_eps) + : dsv4_build_compressor_decode_chunk(ctx0, cur, + prev_index_kv_state, + prev_index_sc_state, + layer.indexer_compressor_kv, + layer.indexer_compressor_gate, + layer.indexer_compressor_ape, + layer.indexer_compressor_norm, + ubatch, + hparams.indexer_head_size, + n_rot, + n_tokens, + compress_ratio, + rope_type, + rope_cfg, + norm_rms_eps); + + dsv4_store_state_segment(ctx0, gf, index_dec.kv_state, inp_rs->mctx->get_r_l(il), state_size, inp_rs->head, attn_state_layout.elems); + dsv4_store_state_segment(ctx0, gf, index_dec.score_state, inp_rs->mctx->get_s_l(il), state_size, inp_rs->head, attn_state_layout.elems); + + if (index_dec.kv_comp != nullptr) { + store_index_cache_rows(index_dec.kv_comp, n_comp_before, n_comp_visible - n_comp_before); + } + + if (n_tokens == 1 && n_comp_visible <= hparams.indexer_top_k) { + comp_mask = get_dsv4_inputs()->add_mask(ctx0, + dsv4_mask_kind::COMPRESS_CAUSAL, + n_comp_visible, n_tokens, + 0, n_comp_visible, 0, compress_ratio, + "dsv4_attn_compress_mask"); + } else { + ggml_tensor * index_cache = dsv4_cache_view_3d(ctx0, mctx_dsv4->get_dsv4_index_k(ctx0, il, seq_id), n_comp_visible); + index_cache = ggml_reshape_2d(ctx0, index_cache, hparams.indexer_head_size, n_comp_visible); + ggml_tensor * index_scores = n_tokens == 1 + ? 
dsv4_build_indexer_scores_decode(ctx0, + cur, qr, index_cache, + layer.indexer_attn_q_b, + layer.indexer_proj, + inp_pos, + hparams.indexer_n_head, + hparams.indexer_head_size, + n_comp_visible, + n_rot, + rope_type, + rope_cfg) + : dsv4_build_indexer_scores_prefill(ctx0, + cur, qr, dsv4_cache_view_3d(ctx0, mctx_dsv4->get_dsv4_index_k(ctx0, il, seq_id), n_comp_visible), + layer.indexer_attn_q_b, + layer.indexer_proj, + inp_pos, + get_dsv4_inputs()->add_mask(ctx0, + dsv4_mask_kind::COMPRESS_CAUSAL, + n_comp_visible, n_tokens, + 0, n_comp_visible, 0, compress_ratio, + "dsv4_indexer_decode_causal_mask"), + hparams.indexer_n_head, + hparams.indexer_head_size, + n_tokens, + n_rot, + rope_type, + rope_cfg); + cb(index_scores, "indexer_scores", il); + + const int top_k = std::min(hparams.indexer_top_k, n_comp_visible); + ggml_tensor * topk = ggml_argsort_top_k(ctx0, index_scores, top_k); + cb(topk, "indexer_topk", il); + + comp_mask = dsv4_build_compressed_mask_from_topk(ctx0, index_scores, topk); + } + } else { + comp_mask = get_dsv4_inputs()->add_mask(ctx0, + dsv4_mask_kind::COMPRESS_CAUSAL, + n_comp_visible, n_tokens, + 0, n_comp_visible, 0, compress_ratio, + "dsv4_attn_compress_mask"); + } + + attn_mask = ggml_concat(ctx0, attn_mask, comp_mask, 0); + } + } + + ggml_tensor * attn_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, attn_mask, GGML_TYPE_F16) : attn_mask; + cur = build_attn_mha(q, k_all, v_all, nullptr, attn_mask_cnv, layer.attn_sinks, nullptr, kq_scale, il); + cb(cur, "kqv_out", il); + } + cur = ggml_reshape_3d(ctx0, cur, n_embd_head_v, n_head, n_tokens); + cur = dsv4_apply_rope_tail(ctx0, cur, inp_pos, + n_embd_head_v, n_head, n_tokens, n_rot, rope_type, + rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale, + rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, true); + cur = dsv4_grouped_out(ctx0, cur, layer.attn_wo_a, layer.attn_wo_b, + n_embd_head_v, n_head, n_out_group, n_lora_o, n_tokens); + cb(cur, "attn_out", il); + inpL = dsv4_hc_post(ctx0, cur, residual, mix.post, mix.comb, n_embd, n_hc, n_tokens); + cb(inpL, "hc_attn_post", il); + + residual = inpL; + mix = dsv4_hc_pre(ctx0, inpL, + layer.hc_ffn_fn, layer.hc_ffn_scale, layer.hc_ffn_base, + n_embd, n_hc, n_tokens, norm_rms_eps, hparams.hc_sinkhorn_iters, hparams.hc_eps); + cur = mix.x; + cb(cur, "hc_ffn_pre", il); + cb(mix.mixes, "hc_ffn_pre_mixes", il); + cb(mix.pre, "hc_ffn_pre_weights", il); + cb(mix.post, "hc_ffn_pre_post_weights", il); + cb(mix.comb, "hc_ffn_pre_comb", il); + cur = build_norm(cur, layer.ffn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + ggml_tensor * selected = nullptr; + if ((uint32_t) il < hparams.n_hash_layers && !cparams.warmup) { + GGML_ASSERT(inp_tokens != nullptr && + "DeepSeek V4 hash routing requires token-id input; embedding-only / multimodal input not supported"); + selected = ggml_get_rows(ctx0, layer.ffn_gate_tid2eid, inp_tokens); + cb(selected, "ffn_moe_hash_topk", il); + } + + ggml_tensor * moe_out = build_moe_ffn(cur, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + layer.ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il, + nullptr, 
+ nullptr, + nullptr, + nullptr, + nullptr, + selected); + cb(moe_out, "ffn_moe_out", il); + ggml_tensor * ffn_shexp = build_ffn(cur, + layer.ffn_up_shexp, nullptr, nullptr, + layer.ffn_gate_shexp, nullptr, nullptr, + layer.ffn_down_shexp, nullptr, nullptr, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + inpL = dsv4_hc_post(ctx0, cur, residual, mix.post, mix.comb, n_embd, n_hc, n_tokens); + cb(inpL, "hc_ffn_post", il); + } + if (inp_out_ids) { + inpL = ggml_reshape_2d(ctx0, inpL, n_embd * n_hc, n_tokens); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_hc, n_outputs); + } + + ggml_tensor * cur = dsv4_hc_head(ctx0, inpL, + model.output_hc_fn, model.output_hc_scale, model.output_hc_base, + n_embd, n_hc, inp_out_ids ? n_outputs : n_tokens, + norm_rms_eps, hparams.hc_eps); + cb(cur, "result_hc", -1); + + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + res->t_logits = cur; + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/models.h b/src/models/models.h index 4e40536a5ea3..2ebf8666dcbd 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -1020,6 +1020,19 @@ struct llama_model_deepseek2 : public llama_model_base { }; +struct llama_model_deepseek4 : public llama_model_base { + llama_model_deepseek4(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + + struct llama_model_deepseek2ocr : public 
llama_model_base { llama_model_deepseek2ocr(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 76f7cb5a867d..24905b43d8af 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4951,6 +4951,271 @@ struct test_rope : public test_case { } }; +// V4 partial-RoPE: leaves the non-RoPE prefix unchanged, applies RoPE to the tail. +// Reference: ggml/include/ggml.h (ggml_dsv4_rope_tail). +// CPU fallback: ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_rope_tail). +// Constraints (ggml.c, ggml_dsv4_rope_tail): mode in {NORMAL, NEOX}; +// a->ne[2] == pos->ne[0]; n_dims > 0 && n_dims <= a->ne[0] && n_dims % 2 == 0; +// if freq_factors, freq_factors->ne[0] >= n_dims/2. +struct test_dsv4_rope_tail : public test_case { + const ggml_type type; + const std::array ne_a; + int n_dims; + int mode; + int n_ctx; + float fs; // freq_scale + float ef; // ext_factor + float af; // attn_factor + bool ff; // use freq_factors + bool inverse; + + std::string vars() override { + return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, inverse); + } + + test_dsv4_rope_tail(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {64, 8, 4, 1}, + int n_dims = 32, int mode = GGML_ROPE_TYPE_NORMAL, int n_ctx = 128, + float fs = 1.0f, float ef = 0.0f, float af = 0.0f, + bool ff = false, bool inverse = false) + : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), + fs(fs), ef(ef), af(af), ff(ff), inverse(inverse) {} + + // NMSE tolerance: 1e-5. Rationale: RoPE is trig + multiply, no + // accumulation. Matches test_rope's de-facto behavior on this backend pair. 
+ double max_nmse_err() override { + return 1e-5; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + // Constraint: a->ne[2] == pos->ne[0]. + ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]); + ggml_set_name(pos, "pos"); + + ggml_tensor * freq = nullptr; + if (ff) { + // Constraint: freq_factors->ne[0] >= n_dims/2. + freq = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims / 2); + ggml_set_name(freq, "freq"); + } + + ggml_tensor * out = ggml_dsv4_rope_tail( + ctx, a, pos, freq, + n_dims, mode, n_ctx, + 10000.0f, fs, ef, af, 1.0f, 1.0f, + inverse); + ggml_set_name(out, "out"); + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + // Match test_rope's pattern: positions are random within [0, n_ctx) so + // the test exercises a representative distribution of RoPE phases on + // every run, not just sequential 0..N-1. + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { + std::vector data(ggml_nelements(t)); + for (size_t i = 0; i < data.size(); ++i) { + data[i] = rand() % n_ctx; + } + ggml_backend_tensor_set(t, data.data(), 0, data.size() * sizeof(int)); + } else { + init_tensor_uniform(t, -1.0f, 1.0f); + } + } + } +}; + +// V4 hyper-connection splitter with Sinkhorn normalization. +// Reference: ggml/include/ggml.h (ggml_dsv4_hc_split_sinkhorn). +// CPU fallback: ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_hc_split_sinkhorn). +// Constraints (ggml.c, ggml_dsv4_hc_split_sinkhorn): mixes->ne[0] == (2 + n_hc) * n_hc; +// mixes->ne[2] == 1; mixes->ne[3] == 1; nelements(scale) >= 3; +// nelements(base) >= mixes->ne[0]. 
+struct test_dsv4_hc_split_sinkhorn : public test_case { + const int n_hc; + const int64_t n_rows; + const int sinkhorn_iters; + const float eps; + + std::string vars() override { + return VARS_TO_STR4(n_hc, n_rows, sinkhorn_iters, eps); + } + + test_dsv4_hc_split_sinkhorn(int n_hc = 4, int64_t n_rows = 16, + int sinkhorn_iters = 4, float eps = 1e-6f) + : n_hc(n_hc), n_rows(n_rows), sinkhorn_iters(sinkhorn_iters), eps(eps) {} + + // NMSE tolerance: 1e-3. Rationale: 4 iterations of normalization compound + // floating-point rounding; per-iteration eps division amplifies relative + // error on near-zero entries. Spec calls for "1e-3 rel"; NMSE 1e-3 is the + // matching budget. + double max_nmse_err() override { + return 1e-3; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + // Hard constraint: mixes->ne[0] MUST equal (2 + n_hc) * n_hc. + const int64_t mix_dim = (int64_t)(2 + n_hc) * n_hc; + + ggml_tensor * mixes = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, mix_dim, n_rows); + ggml_set_param(mixes); + ggml_set_name(mixes, "mixes"); + + // scale: nelements(scale) >= 3. Constructor uses scale as a 1D + // parameter buffer. Use a 1D tensor of size 3 (the minimum). + ggml_tensor * scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3); + ggml_set_param(scale); + ggml_set_name(scale, "scale"); + + // base: nelements(base) >= mixes->ne[0]. Use a 1D tensor of size mix_dim. + ggml_tensor * base = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mix_dim); + ggml_set_param(base); + ggml_set_name(base, "base"); + + ggml_tensor * out = ggml_dsv4_hc_split_sinkhorn(ctx, mixes, scale, base, n_hc, sinkhorn_iters, eps); + ggml_set_name(out, "out"); + return out; + } +}; + +// V4 hyper-connection weighted-sum: out[embd, token] = sum_hc weights[hc, token] * x[embd, hc, token]. +// Reference: ggml/include/ggml.h (ggml_dsv4_hc_weighted_sum). +// CPU fallback: ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_hc_weighted_sum). 
+// Constraints (ggml.c, ggml_dsv4_hc_weighted_sum): +// x shape {n_embd, n_hc, n_tokens, 1} +// weights shape {n_hc, n_tokens, 1, 1} +struct test_dsv4_hc_weighted_sum : public test_case { + const int64_t n_embd; + const int64_t n_hc; + const int64_t n_tokens; + + std::string vars() override { + return VARS_TO_STR3(n_embd, n_hc, n_tokens); + } + + test_dsv4_hc_weighted_sum(int64_t n_embd = 128, int64_t n_hc = 4, int64_t n_tokens = 16) + : n_embd(n_embd), n_hc(n_hc), n_tokens(n_tokens) {} + + // NMSE tolerance: 1e-5. Rationale: each output is a pure F32 multiply-add + // over only n_hc terms (4 or 8 in the registered cases), so accumulation error is small. + double max_nmse_err() override { + return 1e-5; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_hc, n_tokens); + ggml_set_param(x); + ggml_set_name(x, "x"); + + ggml_tensor * weights = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_hc, n_tokens); + ggml_set_param(weights); + ggml_set_name(weights, "weights"); + + ggml_tensor * out = ggml_dsv4_hc_weighted_sum(ctx, x, weights); + ggml_set_name(out, "out"); + return out; + } +}; + +// V4 hyper-connection expand: out[embd, hc, token] = post[hc, token] * block_out[embd, token] +// + (comb[:, :, token]^T @ residual[:, :, token])[embd, hc]. +// Reference: ggml/include/ggml.h (ggml_dsv4_hc_expand). +// CPU fallback: ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_hc_expand).
+// Constraints (ggml.c, ggml_dsv4_hc_expand): +// block_out shape {n_embd, n_tokens, 1, 1} (2D, NOT 3D) +// residual shape {n_embd, n_hc, n_tokens, 1} +// post shape {n_hc, n_tokens, 1, 1} +// comb shape {n_hc, n_hc, n_tokens, 1} +struct test_dsv4_hc_expand : public test_case { + const int64_t n_embd; + const int64_t n_hc; + const int64_t n_tokens; + + std::string vars() override { + return VARS_TO_STR3(n_embd, n_hc, n_tokens); + } + + test_dsv4_hc_expand(int64_t n_embd = 128, int64_t n_hc = 4, int64_t n_tokens = 16) + : n_embd(n_embd), n_hc(n_hc), n_tokens(n_tokens) {} + + // NMSE tolerance: 1e-5. Rationale: one small contraction over n_hc + // (comb^T @ residual) plus the pointwise post * block_out term; minimal accumulation noise in F32. + double max_nmse_err() override { + return 1e-5; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + // block_out is 2D ({n_embd, n_tokens}), i.e. ne[2] == 1 and ne[3] == 1 as the op requires. + ggml_tensor * block_out = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_param(block_out); + ggml_set_name(block_out, "block_out"); + + ggml_tensor * residual = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_hc, n_tokens); + ggml_set_param(residual); + ggml_set_name(residual, "residual"); + + ggml_tensor * post = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_hc, n_tokens); + ggml_set_param(post); + ggml_set_name(post, "post"); + + ggml_tensor * comb = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_hc, n_hc, n_tokens); + ggml_set_param(comb); + ggml_set_name(comb, "comb"); + + ggml_tensor * out = ggml_dsv4_hc_expand(ctx, block_out, residual, post, comb); + ggml_set_name(out, "out"); + return out; + } +}; + +// V4 FP8 KV-cache simulation: quantizes/dequantizes the non-RoPE prefix +// in E4M3FN blocks, leaves the RoPE tail unchanged. +// Reference: ggml/include/ggml.h (ggml_dsv4_fp8_kv_quantize). +// CPU fallback: ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_fp8_kv_quantize).
+// Constraints (ggml.c, ggml_dsv4_fp8_kv_quantize): n_rot >= 0; a->ne[0] > n_rot; +// (a->ne[0] - n_rot) % 64 == 0 (block size is 64 for the FP8 prefix). +struct test_dsv4_fp8_kv_quantize : public test_case { + const std::array ne_a; + const int n_rot; + + std::string vars() override { + return VARS_TO_STR2(ne_a, n_rot); + } + + test_dsv4_fp8_kv_quantize(std::array ne_a = {192, 8, 4, 1}, + int n_rot = 64) + : ne_a(ne_a), n_rot(n_rot) {} + + // NMSE tolerance: 1e-3. Rationale: FP8 e4m3 represents ~7 bits of mantissa; + // the quantize-dequantize round-trip's NMSE is dominated by representable + // precision, not by accumulation. The spec's "1e-3 abs (FP8 inherently + // lossy)" maps to NMSE 1e-3 because each sample's squared error is bounded + // by the FP8 ULP^2 at the local scale, normalized by signal power yields + // roughly the same order. + double max_nmse_err() override { + return 1e-3; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + // Constraint check at construction time so test fails fast on a bad shape. + GGML_ASSERT(ne_a[0] > n_rot && "(ne_a[0] > n_rot) required"); + GGML_ASSERT((ne_a[0] - n_rot) % 64 == 0 && "(ne_a[0]-n_rot) %% 64 == 0 required"); + + ggml_tensor * a = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_a.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_dsv4_fp8_kv_quantize(ctx, a, n_rot); + ggml_set_name(out, "out"); + return out; + } +}; + // GGML_OP_POOL2D struct test_pool2d : public test_case { enum ggml_op_pool pool_type; @@ -8707,6 +8972,53 @@ static std::vector> make_test_cases_eval() { } } + // V4-port: dsv4_rope_tail (partial-RoPE) test cases + for (bool inverse : {false, true}) { + for (bool ff : {false, true}) { + // F32, default shape + test_cases.emplace_back(new test_dsv4_rope_tail( + GGML_TYPE_F32, {64, 8, 4, 1}, 32, GGML_ROPE_TYPE_NORMAL, 128, + 1.0f, 0.0f, 0.0f, ff, inverse)); + } + } + // Edge: larger head_dim, NEOX mode (exercises the second supported mode path). 
+ test_cases.emplace_back(new test_dsv4_rope_tail( + GGML_TYPE_F32, {128, 16, 8, 1}, 64, GGML_ROPE_TYPE_NEOX, 256, + 1.0f, 0.0f, 0.0f, false, false)); + // (F16 dtype variant intentionally NOT registered: the reference op path + // for dsv4_rope_tail requires an F32 src0 on the backends that implement + // it, so an F16 case would surface as NOT_SUPPORTED and silently pass + // without exercising the op. F32-only here.) + + // V4-port: dsv4_hc_split_sinkhorn test cases. + // For n_hc=4 -> mix_dim = (2+4)*4 = 24. + // For n_hc=8 -> mix_dim = (2+8)*8 = 80. + test_cases.emplace_back(new test_dsv4_hc_split_sinkhorn(4, 16, 4, 1e-6f)); + test_cases.emplace_back(new test_dsv4_hc_split_sinkhorn(4, 32, 4, 1e-6f)); + test_cases.emplace_back(new test_dsv4_hc_split_sinkhorn(4, 16, 8, 1e-6f)); + test_cases.emplace_back(new test_dsv4_hc_split_sinkhorn(8, 16, 4, 1e-6f)); + + // V4-port: dsv4_hc_weighted_sum test cases (n_embd, n_hc, n_tokens). + test_cases.emplace_back(new test_dsv4_hc_weighted_sum(128, 4, 16)); + test_cases.emplace_back(new test_dsv4_hc_weighted_sum(512, 4, 32)); + test_cases.emplace_back(new test_dsv4_hc_weighted_sum(64, 8, 8)); + + // V4-port: dsv4_hc_expand test cases (n_embd, n_hc, n_tokens). + test_cases.emplace_back(new test_dsv4_hc_expand(128, 4, 16)); + test_cases.emplace_back(new test_dsv4_hc_expand(512, 4, 32)); + test_cases.emplace_back(new test_dsv4_hc_expand(64, 8, 8)); + + // V4-port: dsv4_fp8_kv_quantize test cases. + // Constraint: (ne_a[0] - n_rot) % 64 == 0. 
Valid examples: + // ne_a[0]=128, n_rot=64 -> prefix=64 (1 block) + // ne_a[0]=192, n_rot=64 -> prefix=128 (2 blocks) + // ne_a[0]=256, n_rot=64 -> prefix=192 (3 blocks) + // ne_a[0]=192, n_rot=128 -> prefix=64 (1 block) + test_cases.emplace_back(new test_dsv4_fp8_kv_quantize({128, 8, 4, 1}, 64)); + test_cases.emplace_back(new test_dsv4_fp8_kv_quantize({192, 8, 4, 1}, 64)); + test_cases.emplace_back(new test_dsv4_fp8_kv_quantize({256, 16, 8, 1}, 64)); + test_cases.emplace_back(new test_dsv4_fp8_kv_quantize({192, 16, 8, 1}, 128)); + for (int v : { 0, 1, 2, 3 }) { for (int dim : { 0, 1, 2, 3, }) { test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v)); diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 3f7f3a11dfa3..671bea226b6a 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -229,6 +229,19 @@ static void compute_cossim(std::vector & tstats) { bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { GGML_UNUSED(user_data); + // imatrix only records calibration statistics from matrix multiplications + // (MUL_MAT and MUL_MAT_ID). Reject every other op early -- before the + // src0 dereference below -- so we don't crash on graph nodes that legitimately + // have null t->src[0] (e.g. leaf inputs, GGML_OP_NONE) or that are V4-specific + // DSV4 ops whose outputs aren't consumed by anything that benefits from + // imatrix data. The cb_eval callback is invoked for every scheduled node, so + // this filter also runs for graph nodes that the original code only happened + // not to crash on by accident on pre-V4 architectures. See + // docs/plans/v4-port-imatrix-diagnosis.md. 
+ if (t->op != GGML_OP_MUL_MAT && t->op != GGML_OP_MUL_MAT_ID) { + return false; + } + const struct ggml_tensor * src0 = t->src[0]; const struct ggml_tensor * src1 = t->src[1]; std::string wname = filter_tensor_name(src0->name); @@ -239,7 +252,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection if (ask) { if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications - if (t->op != GGML_OP_MUL_MAT) return false; // why are small batches ignored (<16 tokens)? if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false; @@ -1240,6 +1252,19 @@ int main(int argc, char ** argv) { params.n_ctx = n_kv; params.n_batch = std::min(params.n_batch, n_kv); + + // V4 fix: imatrix raises n_parallel (=> cparams.n_seq_max) so it can + // fan out chunks across multiple sequences for throughput. With + // kv_unified=false (the default) this allocates per-stream KV + // buffers, which collide with V4's compressed-attention graph: V4 + // unconditionally forces n_seqs=1 for LLM_ARCH_DEEPSEEK4 and its + // compressed-attention reshape hard-codes n_stream == 1, so it + // aborts on the elements-mismatch assertion in ggml_reshape_3d when + // n_stream > 1. Forcing kv_unified=true keeps a single shared KV + // buffer (n_stream=1) without reducing imatrix's ubatch + // parallelism, and is benign for non-V4 archs. See + // docs/plans/v4-port-imatrix-diagnosis.md. + params.kv_unified = true; } g_collector.set_params(params);