[MODEL] support deepseek_v4 (#2877)

ZX-ModelCloud · github-code-quality[bot] · web-flow · commit ed42bc582c9a · 2026-05-15T15:25:00.000+08:00
* LazyTurtle support deepseek_v4

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>

* support deepseek_v4's WeightConverter

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>

* support deepseek_v4

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>

* update README

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>

* Potential fix for pull request finding 'Unused import'

Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>

---------

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
diff --git a/README.md b/README.md
@@ -21,7 +21,7 @@
 
 ## Latest News
 
-* 05/13/2026 7.1.0-dev `main`: ✨ Added `minicpmv_4_6` model support
+* 05/13/2026 7.1.0-dev `main`: ✨ Added `minicpmv_4_6` and `DeepSeek V4` model support
 * 05/07/2026 7.1.0-dev `main`: ✨ Added `GLM-4.5V`, `GLM-4.6V`, `Zamba` and `Zamba2` model support
 * 04/29/2026 7.1.0-dev `main`: ✨ Added PoolSideAI `Laguna` model support for fused Laguna MoE checkpoints. Added `ERNIE 4.5 VL MoE`, `Ling-2.6-flash` and NVIDIA `Nemotron 3 Nano Omni` model support.
 * 04/28/2026 [7.0.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v7.0.0): 🚀 Added Huawei Ascend NPU support through native torch kernels for GPTQ, AWQ, ParoQuant, GGUF, QQQ, and EXL3. Added `internvl_chat`, `gemma3n`, `GLM-OCR`, `GLM-ASR`, and `falcon_mamba` model support.
@@ -254,7 +254,7 @@ Selected public references where teams or companies explicitly mention GPT-QMode
 | Cohere 1-2               | ✅ | GPT-Neo / NeoX                  | ✅ | Llama 1-3.3      | ✅ | Nemotron H / Omni   | ✅ | StarCoder2              | ✅ |
 | DBRX Converted           | ✅ | GPT-2                           | ✅ | Llama 3.2 VL     | ✅ | Nemotron Ultra      | ✅ | TeleChat2               | ✅ |
 | Deci                     | ✅ | GPT-J                           | ✅ | Llama 4          | ✅ | OPT                 | ✅ | Trinity                 | ✅ |
-| DeepSeek-V2/V3/R1        | ✅ | GPT-OSS                         | ✅ | LongCat Flash    | ✅ | OLMo2 / LLaDA2      | ✅ | Yi                      | ✅ |
+| DeepSeek-V2/V3/V4/R1     | ✅ | GPT-OSS                         | ✅ | LongCat Flash    | ✅ | OLMo2 / LLaDA2      | ✅ | Yi                      | ✅ |
 | DeepSeek-V2-Lite         | ✅ | Granite / Granite MoE           | ✅ | LongLLaMA        | ✅ | Ovis 1.6/2          | ✅ | Seed-OSS                | ✅ |
 | Dream                    | ✅ | GRIN-MoE                        | ✅ | Instella         | ✅ | Phi 1-4             | ✅ | Voxtral                 | ✅ |
 | ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4_6 | ✅ | PanGu-α                 | ✅ |
diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
@@ -82,6 +82,7 @@
 from .definitions.decilm import DeciLMQModel  # noqa: E402
 from .definitions.deepseek_v2 import DeepSeekV2QModel  # noqa: E402
 from .definitions.deepseek_v3 import DeepSeekV3QModel  # noqa: E402
+from .definitions.deepseek_v4 import DeepSeekV4QModel  # noqa: E402
 from .definitions.dots1 import Dots1QModel  # noqa: E402
 from .definitions.dream import DreamQModel  # noqa: E402
 from .definitions.ernie4_5 import Ernie4_5QModel  # noqa: E402
@@ -264,6 +265,7 @@
     "dbrx_converted": DbrxConvertedQModel,
     "deepseek_v2": DeepSeekV2QModel,
     "deepseek_v3": DeepSeekV3QModel,
+    "deepseek_v4": DeepSeekV4QModel,
     "dots1": Dots1QModel,
     "exaone": ExaOneQModel,
     "exaone4": Exaone4QModel,
diff --git a/gptqmodel/models/definitions/__init__.py b/gptqmodel/models/definitions/__init__.py
@@ -18,6 +18,7 @@
 from .decilm import DeciLMQModel
 from .deepseek_v2 import DeepSeekV2QModel
 from .deepseek_v3 import DeepSeekV3QModel
+from .deepseek_v4 import DeepSeekV4QModel
 from .dots1 import Dots1QModel
 from .dream import DreamQModel
 from .exaone import ExaOneQModel
diff --git a/gptqmodel/models/definitions/deepseek_v4.py b/gptqmodel/models/definitions/deepseek_v4.py
@@ -0,0 +1,41 @@
+# SPDX-FileCopyrightText: 2026 ModelCloud.ai
+# SPDX-FileCopyrightText: 2026 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+from .deepseek_v3 import DeepSeekV3QModel
+
+
+class DeepSeekV4QModel(DeepSeekV3QModel):  # DeepSeek V4 definition; anything not overridden here comes from the V3 definition
+    dynamic_expert_index = "n_routed_experts"  # presumably the config attribute holding the routed-expert count — confirm against the base class
+    rotary_embedding = "model.rotary_emb"  # presumably the module path of the rotary embedding — confirm
+    module_tree = [  # layout of one decoder layer, reached via model -> layers -> "#"
+        "model",
+        "layers",
+        "#",  # NOTE(review): looks like a placeholder for the numeric layer index — confirm
+        {
+            "input_layernorm": ("input_layernorm:!",),
+            "self_attn": (
+                "q_a_norm:!",  # NOTE(review): ":!" entries appear to be kept out of the quantized set — confirm
+                "q_a_proj:0",  # NOTE(review): numeric suffix presumably assigns the module to a processing group — confirm
+                "q_b_norm:!",
+                "q_b_proj:0",
+                "o_a_proj:!",
+                "o_b_proj:1",
+                "kv_norm:!",
+                "kv_proj:2",
+            ),
+            "post_attention_layernorm": ("post_attention_layernorm:!",),
+            "mlp:moe": {  # NOTE(review): ":moe" presumably tags this subtree as a mixture-of-experts block — confirm
+                "gate": ("gate:!",),
+                "experts": {
+                    "#": ("gate_proj:0", "up_proj:0", "down_proj:1"),  # same three projections listed for every routed expert
+                },
+                "shared_experts": ("gate_proj:0", "up_proj:0", "down_proj:1"),  # shared experts list the same projections as routed ones
+            },
+        },
+    ]
+
+
+
+__all__ = ["DeepSeekV4QModel"]
diff --git a/gptqmodel/utils/structure.py b/gptqmodel/utils/structure.py
diff --git a/tests/models/test_deepseek_v4.py b/tests/models/test_deepseek_v4.py
@@ -0,0 +1,33 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+from model_test import ModelTest
+
+from gptqmodel import BACKEND
+
+
+class TestDeepseekV4(ModelTest):  # quantize-and-evaluate test for DeepSeek V4, scored on arc_challenge
+    NATIVE_MODEL_ID = "/monster/data/model/DeepSeek-V4-Flash-BF16"  # "deepseek-ai/DeepSeek-V4-Flash-BF16"
+    NATIVE_ARC_CHALLENGE_ACC = 0.4753  # expected arc_challenge "acc" value
+    NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4855  # expected arc_challenge "acc_norm" value
+    NATIVE_ARC_CHALLENGE_ACC_SLOW = NATIVE_ARC_CHALLENGE_ACC  # slow/fast variants alias the same baselines
+    NATIVE_ARC_CHALLENGE_ACC_NORM_SLOW = NATIVE_ARC_CHALLENGE_ACC_NORM
+    NATIVE_ARC_CHALLENGE_ACC_FAST = NATIVE_ARC_CHALLENGE_ACC_SLOW
+    NATIVE_ARC_CHALLENGE_ACC_NORM_FAST = NATIVE_ARC_CHALLENGE_ACC_NORM_SLOW
+    TRUST_REMOTE_CODE = True
+    EVAL_TASKS_SLOW = {
+        "arc_challenge": {
+            "chat_template": True,
+            "acc": {"value": NATIVE_ARC_CHALLENGE_ACC},
+            "acc_norm": {"value": NATIVE_ARC_CHALLENGE_ACC_NORM},
+        },
+    }
+    EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)  # fast task table is derived from the slow one
+    LOAD_BACKEND = BACKEND.AUTO
+    USE_FLASH_ATTN = False
+    MODEL_COMPAT_FAST_LAYER_POSITION = "first"
+
+    def test_deepseek_v4(self):  # runs the ModelTest quantize-and-evaluate flow with the settings above
+        self.quantize_and_evaluate()
+
diff --git a/tests/test_deepseek_v4_support.py b/tests/test_deepseek_v4_support.py
@@ -0,0 +1,32 @@
+from types import SimpleNamespace
+
+from gptqmodel.models import auto
+from gptqmodel.models.definitions.deepseek_v4 import DeepSeekV4QModel
+
+
+def test_deepseek_v4_model_type_selects_definition(monkeypatch):  # model_type "deepseek_v4" must resolve to DeepSeekV4QModel
+    fake_config = SimpleNamespace(model_type="deepseek_v4")  # stand-in config; only model_type is set
+
+    monkeypatch.setattr(auto, "resolve_trust_remote_code", lambda path, trust_remote_code=False: trust_remote_code)  # stub returns the flag unchanged
+    monkeypatch.setattr(auto.AutoConfig, "from_pretrained", lambda *args, **kwargs: fake_config)  # stub returns the fake config for any arguments
+
+    assert auto.check_and_get_model_definition("/tmp/deepseek-v4") is DeepSeekV4QModel
+
+
+def test_deepseek_v4_module_tree_matches_v4_attention_and_fused_experts():  # checks which module names simple_layer_modules() yields
+    layer_modules = DeepSeekV4QModel.simple_layer_modules(
+        model_config=SimpleNamespace(n_routed_experts=256),  # expert index 99 asserted below is under this count
+        quantize_config=SimpleNamespace(dynamic=None),  # stand-in quantize config with dynamic set to None
+    )
+    flat_modules = {name for block in layer_modules for name in block}  # flatten the nested blocks into one set of names
+
+    assert "self_attn.q_a_proj" in flat_modules
+    assert "self_attn.q_b_proj" in flat_modules
+    assert "self_attn.kv_proj" in flat_modules
+    assert "self_attn.o_b_proj" in flat_modules
+    # grouped projection must stay native and should not be part of quant blocks
+    assert "self_attn.o_a_proj" not in flat_modules
+    assert "mlp.experts.99.gate_proj" in flat_modules
+    assert "mlp.experts.99.up_proj" in flat_modules
+    assert "mlp.experts.99.down_proj" in flat_modules
+    assert "mlp.shared_experts.gate_proj" in flat_modules
diff --git a/tests/test_lazy_turtle_conversion_mapping.py b/tests/test_lazy_turtle_conversion_mapping.py