support hunyuan_v1_dense and hunyuan_v1_moe (#2910)

ZX-ModelCloud · web-flow · commit 28b3870eb20a · 2026-05-26T06:26:56.000+08:00
Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>
diff --git a/README.md b/README.md
@@ -21,6 +21,7 @@
 
 ## Latest News
 
+* 05/25/2026 7.1.0-dev `main`: ✨ Added `hunyuan_v1_dense` and `hunyuan_v1_moe` model support
 * 05/21/2026 7.1.0-dev `main`: ✨ Added `nemotron_labs_diffusion` model support
 * 05/20/2026 7.1.0-dev `main`: ✨ Added `interns1`, `ovis2_5`, `ovis2_6_moe` and `ovis2_6_next` model support
 * 05/15/2026 7.1.0-dev `main`: ✨ Added `mimo_v2` model support
@@ -248,23 +249,24 @@ Selected public references where teams or companies explicitly mention GPT-QMode
 
 ## Model Support  
 
-| Model                    |   |                                 |   |                  |   |                                 |   |                         |   |
-|--------------------------|---|---------------------------------|---|------------------|---|---------------------------------|---|-------------------------|---|
+| Model                    |   |                                 |  |                  |  |                                 |  |                        |   |
+|--------------------------|---|---------------------------------|--|------------------|--|---------------------------------|--|------------------------|---|
 | Apertus                  | ✅ | EXAONE 3/4                      | ✅ | Dots1            | ✅ | Mistral3                        | ✅ | Qwen 2/3/3.5 (Next/MoE) | ✅ |
-| Baichuan                 | ✅ | Falcon (H1 / Mamba)             | ✅ | InternLM 1/2/2.5 | ✅ | Mixtral                         | ✅ | Qwen 2/2.5/3 VL         | ✅ |
-| Bloom                    | ✅ | FastVLM                         | ✅ | Kimi K2          | ✅ | MobileLLM                       | ✅ | Qwen 2.5/3 Omni         | ✅ |
-| ChatGLM                  | ✅ | Gemma 1-4 / 3n                  | ✅ | Klear            | ✅ | MOSS                            | ✅ | RefinedWeb              | ✅ |
-| CodeGen                  | ✅ | GPTBigCode                      | ✅ | LING/RING        | ✅ | MPT                             | ✅ | StableLM                | ✅ |
-| Cohere 1-2               | ✅ | GPT-Neo / NeoX                  | ✅ | Llama 1-3.3      | ✅ | Nemotron H / Omni               | ✅ | StarCoder2              | ✅ |
-| DBRX Converted           | ✅ | GPT-2                           | ✅ | Llama 3.2 VL     | ✅ | Nemotron Ultra / Labs-Diffusion               | ✅ | TeleChat2               | ✅ |
-| Deci                     | ✅ | GPT-J                           | ✅ | Llama 4          | ✅ | OPT                             | ✅ | Trinity                 | ✅ |
-| DeepSeek-V2/V3/V4/R1        | ✅ | GPT-OSS                         | ✅ | LongCat Flash    | ✅ | OLMo2 / LLaDA2                  | ✅ | Yi                      | ✅ |
-| DeepSeek-V2-Lite         | ✅ | Granite / Granite MoE           | ✅ | LongLLaMA        | ✅ | Ovis 1.6/2/2.5/2.6 MoE/2.6 Next | ✅ | Seed-OSS                | ✅ |
-| Dream                    | ✅ | GRIN-MoE                        | ✅ | Instella         | ✅ | Phi 1-4                         | ✅ | Voxtral                 | ✅ |
-| ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4_6             | ✅ | PanGu-α                 | ✅ |
-| XVERSE                   | ✅ | Brumby                          | ✅ | Hymba            | ✅ | Mistral                         | ✅ | Qwen 1/2/3/3.5          | ✅ |
-| MiniMax M2               | ✅ | AfMoE                           | ✅ | Bailing-MoE      | ✅ | LFM2-MoE                        | ✅ | Marin                   | ✅ |
-| InternVL Chat            | ✅ | Laguna                          | ✅ | Mimo / Mimo V2   | ✅ | Zamba / Zamba2                  | ✅ | Intern S1               | ✅  |
+| Baichuan                 | ✅ | Falcon (H1 / Mamba)             | ✅ | InternLM 1/2/2.5 | ✅ | Mixtral                         | ✅ | Qwen 2/2.5/3 VL        | ✅ |
+| Bloom                    | ✅ | FastVLM                         | ✅ | Kimi K2          | ✅ | MobileLLM                       | ✅ | Qwen 2.5/3 Omni        | ✅ |
+| ChatGLM                  | ✅ | Gemma 1-4 / 3n                  | ✅ | Klear            | ✅ | MOSS                            | ✅ | RefinedWeb             | ✅ |
+| CodeGen                  | ✅ | GPTBigCode                      | ✅ | LING/RING        | ✅ | MPT                             | ✅ | StableLM               | ✅ |
+| Cohere 1-2               | ✅ | GPT-Neo / NeoX                  | ✅ | Llama 1-3.3      | ✅ | Nemotron H / Omni               | ✅ | StarCoder2             | ✅ |
+| DBRX Converted           | ✅ | GPT-2                           | ✅ | Llama 3.2 VL     | ✅ | Nemotron Ultra / Labs-Diffusion               | ✅ | TeleChat2              | ✅ |
+| Deci                     | ✅ | GPT-J                           | ✅ | Llama 4          | ✅ | OPT                             | ✅ | Trinity                | ✅ |
+| DeepSeek-V2/V3/V4/R1     | ✅ | GPT-OSS                         | ✅ | LongCat Flash    | ✅ | OLMo2 / LLaDA2                  | ✅ | Yi                     | ✅ |
+| DeepSeek-V2-Lite         | ✅ | Granite / Granite MoE           | ✅ | LongLLaMA        | ✅ | Ovis 1.6/2/2.5/2.6 MoE/2.6 Next | ✅ | Seed-OSS               | ✅ |
+| Dream                    | ✅ | GRIN-MoE                        | ✅ | Instella         | ✅ | Phi 1-4                         | ✅ | Voxtral                | ✅ |
+| ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4_6             | ✅ | PanGu-α                | ✅ |
+| XVERSE                   | ✅ | Brumby                          | ✅ | Hymba            | ✅ | Mistral                         | ✅ | Qwen 1/2/3/3.5         | ✅ |
+| MiniMax M2               | ✅ | AfMoE                           | ✅ | Bailing-MoE      | ✅ | LFM2-MoE                        | ✅ | Marin                  | ✅ |
+| InternVL Chat            | ✅ | Laguna                          | ✅ | Mimo / Mimo V2   | ✅ | Zamba / Zamba2                  | ✅ | Intern S1              | ✅ |
+| HunYuan V1 Dense / MoE   | ✅ |                           |  |    |  |                  |  |                |   |
 
 Prism Bonsai GGUF checkpoints are supported for inference only through GPT-QModel's native GGUF path and internal GGUF runtime. Bonsai checkpoints load through the normal model path or repo argument and do not require the external `gguf` package. Prism model quantization is not included.
 
diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
@@ -114,6 +114,8 @@
 from .definitions.granitemoehybrid import GraniteMoeHybridQModel
 from .definitions.grinmoe import GrinMoeQModel  # noqa: E402
 from .definitions.hrm_text import HrmTextQModel  # noqa: E402
+from .definitions.hunyuan_v1_dense import HunYuanDenseV1QModel  # noqa: E402
+from .definitions.hunyuan_v1_moe import HunYuanMoEV1QModel  # noqa: E402
 from .definitions.hymba import HymbaQModel  # noqa: E402
 from .definitions.instella import InstellaQModel  # noqa: E402
 from .definitions.internlm import InternLMQModel  # noqa: E402
@@ -230,6 +232,8 @@
     "interns1": InternS1QModel,
     "internvl_chat": InternVLChatQModel,
     "hrm_text": HrmTextQModel,
+    "hunyuan_v1_dense": HunYuanDenseV1QModel,
+    "hunyuan_v1_moe": HunYuanMoEV1QModel,
     "qwen": QwenQModel,
     "mistral": LlamaQModel, # 100% llama clone
     "yi": LlamaQModel, # 100% llama clone
diff --git a/gptqmodel/models/definitions/__init__.py b/gptqmodel/models/definitions/__init__.py
@@ -41,6 +41,8 @@
 from .gptj import GptJQModel
 from .grinmoe import GrinMoeQModel
 from .hrm_text import HrmTextQModel
+from .hunyuan_v1_dense import HunYuanDenseV1QModel
+from .hunyuan_v1_moe import HunYuanMoEV1QModel
 from .hymba import HymbaQModel
 from .instella import InstellaQModel
 from .internlm import InternLMQModel
diff --git a/gptqmodel/models/definitions/hunyuan_v1_dense.py b/gptqmodel/models/definitions/hunyuan_v1_dense.py
@@ -0,0 +1,31 @@
+# SPDX-FileCopyrightText: 2026 ModelCloud.ai
+# SPDX-License-Identifier: Apache-2.0
+
+from .llama import LlamaQModel
+
+
+class HunYuanDenseV1QModel(LlamaQModel):
+    """
+    Hunyuan Dense V1 follows a Llama-style decoder layout with per-head Q/K
+    RMSNorm modules inside attention. Those norms are metadata/base modules for
+    quantization and should not be replaced by quantized linear kernels.
+    """
+
+    module_tree = [
+        "model",
+        "layers",
+        "#",
+        {
+            "input_layernorm": ("input_layernorm:!",),
+            "self_attn": (
+                "query_layernorm:!",
+                "q_proj:0",
+                "key_layernorm:!",
+                "k_proj:0",
+                "v_proj:0",
+                "o_proj:1",
+            ),
+            "post_attention_layernorm": ("post_attention_layernorm:!",),
+            "mlp": ("gate_proj:0", "up_proj:0", "down_proj:1"),
+        },
+    ]
diff --git a/gptqmodel/models/definitions/hunyuan_v1_moe.py b/gptqmodel/models/definitions/hunyuan_v1_moe.py
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: 2026 ModelCloud.ai
+# SPDX-License-Identifier: Apache-2.0
+
+from gptqmodel.models.moe_lifecycle import GateUpDownMoELifecycleHooks
+
+from ..base import BaseQModel
+
+
+class HunYuanMoEV1QModel(BaseQModel):
+    dynamic_expert_index = "num_experts"
+
+    pre_lm_head_norm_module = "model.norm"
+
+    # Hunyuan MoE uses GQA, so AWQ should not force o_proj scaling shape to
+    # match v_proj.
+    awq_scale_optimize_shape_dependent_modules = ["self_attn.o_proj"]
+
+    moe_lifecycle_hooks = GateUpDownMoELifecycleHooks()
+    moe_lifecycle_hooks.shared_expert_block_names = ["shared_mlp"]
+
+    module_tree = [
+        "model",
+        "layers",
+        "#",
+        {
+            "input_layernorm": ("input_layernorm:!",),
+            "self_attn": (
+                "q_proj:0",
+                "k_proj:0",
+                "v_proj:0",
+                "o_proj:1",
+                "query_layernorm:!",
+                "key_layernorm:!",
+            ),
+            "post_attention_layernorm": ("post_attention_layernorm:!",),
+            "mlp:moe:?": {
+                # Router weights are tiny and are not useful weight-only targets.
+                "gate": ("gate:!",),
+                # The original forward runs shared_mlp before routed experts.
+                "shared_mlp": ("gate_proj:0", "up_proj:0", "down_proj:1"),
+                "experts:0": {
+                    "#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
+                },
+            },
+        },
+    ]
diff --git a/tests/models/test_hunyuan_v1_dense.py b/tests/models/test_hunyuan_v1_dense.py
@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+from model_test import ModelTest
+
+
+class TestNemotronUltra(ModelTest):
+    NATIVE_MODEL_ID = "/monster/data/model/HY-MT1.5-1.8B" # tencent/HY-MT1.5-1.8B
+    EVAL_TASKS_SLOW = {
+        "arc_challenge": {
+            "chat_template": True,
+            "acc": {"value": 0.3182, "floor_pct": 0.36},
+            "acc_norm": {"value": 0.3472, "floor_pct": 0.36},
+        },
+        "mmlu_stem": {
+            "chat_template": False,
+            "acc": {
+                "value": 0.4024,
+                "floor_pct": 0.04,
+            },
+        },
+    }
+    EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)
+
+    def test_nemotron_ultra(self):
+        # self.quantize_and_evaluate()
+        print(self.evaluate_model(self.SAVE_PATH))
diff --git a/tests/models/test_hunyuan_v1_moe.py b/tests/models/test_hunyuan_v1_moe.py
@@ -0,0 +1,28 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+from model_test import ModelTest
+
+
+class TestNemotronUltra(ModelTest):
+    NATIVE_MODEL_ID = "/monster/data/model/Hunyuan-A13B-Instruct" # tencent/Hunyuan-A13B-Instruct
+    EVAL_TASKS_SLOW = {
+        "arc_challenge": {
+            "chat_template": True,
+            "acc": {"value": 0.3182, "floor_pct": 0.36},
+            "acc_norm": {"value": 0.3472, "floor_pct": 0.36},
+        },
+        "mmlu_stem": {
+            "chat_template": False,
+            "acc": {
+                "value": 0.4024,
+                "floor_pct": 0.04,
+            },
+        },
+    }
+    EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)
+
+    def test_nemotron_ultra(self):
+        self.quantize_and_evaluate()
diff --git a/tests/models/test_nemotron_labs_diffusion.py b/tests/models/test_nemotron_labs_diffusion.py
@@ -19,7 +19,6 @@ class TestNemotronUltra(ModelTest):
     }
     EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)
     TRUST_REMOTE_CODE = True
-    SAVE_PATH = "./temp/Nemotron-Labs-Diffusion"
 
     def test_nemotron_ultra(self):
         self.quantize_and_evaluate()
diff --git a/tests/test_hunyuan_v1_dense_support.py b/tests/test_hunyuan_v1_dense_support.py
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: 2026 ModelCloud.ai
+# SPDX-License-Identifier: Apache-2.0
+
+from types import SimpleNamespace
+
+import defuser
+from accelerate import init_empty_weights
+from transformers import AutoModelForCausalLM
+from transformers.models.hunyuan_v1_moe.configuration_hunyuan_v1_moe import HunYuanMoEV1Config
+
+from gptqmodel.models import auto
+from gptqmodel.models.definitions.hunyuan_v1_dense import HunYuanDenseV1QModel
+from gptqmodel.models.definitions.hunyuan_v1_moe import HunYuanMoEV1QModel
+
+
+def test_hunyuan_v1_dense_model_type_selects_definition(monkeypatch):
+    fake_config = SimpleNamespace(model_type="hunyuan_v1_dense")
+
+    monkeypatch.setattr(auto, "resolve_trust_remote_code", lambda path, trust_remote_code=False: trust_remote_code)
+    monkeypatch.setattr(auto.AutoConfig, "from_pretrained", lambda *args, **kwargs: fake_config)
+
+    assert auto.check_and_get_model_definition("/tmp/hunyuan_v1_dense") is HunYuanDenseV1QModel
+
+
+def test_hunyuan_v1_dense_module_tree_skips_qk_norms():
+    attn_modules = HunYuanDenseV1QModel.module_tree[-1]["self_attn"]
+
+    assert "q_proj:0" in attn_modules
+    assert "k_proj:0" in attn_modules
+    assert "v_proj:0" in attn_modules
+    assert "o_proj:1" in attn_modules
+    assert "query_layernorm:!" in attn_modules
+    assert "key_layernorm:!" in attn_modules
+
+
+def test_hunyuan_v1_moe_model_type_selects_definition(monkeypatch):
+    fake_config = SimpleNamespace(model_type="hunyuan_v1_moe")
+
+    monkeypatch.setattr(auto, "resolve_trust_remote_code", lambda path, trust_remote_code=False: trust_remote_code)
+    monkeypatch.setattr(auto.AutoConfig, "from_pretrained", lambda *args, **kwargs: fake_config)
+
+    assert auto.check_and_get_model_definition("/tmp/hunyuan_v1_moe") is HunYuanMoEV1QModel
+
+
+def test_hunyuan_v1_moe_module_tree_matches_defused_experts():
+    cfg = HunYuanMoEV1Config(
+        vocab_size=128,
+        hidden_size=64,
+        intermediate_size=32,
+        num_hidden_layers=1,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        num_experts=4,
+        moe_topk=2,
+        head_dim=16,
+        max_position_embeddings=128,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=True,
+    )
+
+    with init_empty_weights(include_buffers=True):
+        model = AutoModelForCausalLM.from_config(cfg)
+
+    assert defuser.convert_model(model, cleanup_original=False) is True
+
+    layer = model.model.layers[0]
+    expert = layer.mlp.experts[0]
+
+    assert hasattr(layer.self_attn, "query_layernorm")
+    assert hasattr(layer.self_attn, "key_layernorm")
+    assert hasattr(layer.mlp, "shared_mlp")
+    assert hasattr(expert, "gate_proj")
+    assert hasattr(expert, "up_proj")
+    assert hasattr(expert, "down_proj")
+
+    attn_modules = HunYuanMoEV1QModel.module_tree[-1]["self_attn"]
+    mlp_tree = HunYuanMoEV1QModel.module_tree[-1]["mlp:moe:?"]
+    layer_modules = HunYuanMoEV1QModel.simple_layer_modules(
+        model_config=cfg,
+        quantize_config=SimpleNamespace(dynamic=None),
+    )
+
+    assert "query_layernorm:!" in attn_modules
+    assert "key_layernorm:!" in attn_modules
+    assert "shared_mlp" in mlp_tree
+    assert "experts:0" in mlp_tree
+    assert ["mlp.shared_mlp.gate_proj", "mlp.shared_mlp.up_proj"] in layer_modules
+    assert ["mlp.shared_mlp.down_proj"] in layer_modules
+    assert any("mlp.experts.0.gate_proj" in block for block in layer_modules)
+    assert any("mlp.experts.0.down_proj" in block for block in layer_modules)
diff --git a/tests/test_nemotron_labs_diffusion_support.py b/tests/test_nemotron_labs_diffusion_support.py
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: 2026 ModelCloud.ai
+# SPDX-License-Identifier: Apache-2.0
+
+from types import SimpleNamespace
+
+from transformers import AutoModel
+
+from gptqmodel.models import auto
+from gptqmodel.models.definitions.nemotron_labs_diffusion import NemotronLabsDiffusionQModel
+
+
+def test_nemotron_labs_diffusion_model_type_selects_definition(monkeypatch):
+    fake_config = SimpleNamespace(model_type="nemotron_labs_diffusion")
+
+    monkeypatch.setattr(auto, "resolve_trust_remote_code", lambda path, trust_remote_code=False: trust_remote_code)
+    monkeypatch.setattr(auto.AutoConfig, "from_pretrained", lambda *args, **kwargs: fake_config)
+
+    assert auto.check_and_get_model_definition("/tmp/nemotron_labs_diffusion") is NemotronLabsDiffusionQModel
+
+
+def test_nemotron_labs_diffusion_definition_matches_remote_code_layout():
+    layer_modules = NemotronLabsDiffusionQModel.simple_layer_modules(
+        model_config=SimpleNamespace(),
+        quantize_config=SimpleNamespace(dynamic=None),
+    )
+    flat_modules = {name for block in layer_modules for name in block}
+
+    assert NemotronLabsDiffusionQModel.require_trust_remote_code is True
+    assert NemotronLabsDiffusionQModel.loader is AutoModel
+    assert NemotronLabsDiffusionQModel.lm_head == "diffusion_head"
+    assert NemotronLabsDiffusionQModel.pre_lm_head_norm_module == "encoder.norm"
+    assert NemotronLabsDiffusionQModel.awq_scale_optimize_shape_dependent_modules == ["self_attn.o_proj"]
+    assert NemotronLabsDiffusionQModel.extract_layers_node() == ["encoder.layers"]
+    assert flat_modules == {
+        "self_attn.q_proj",
+        "self_attn.k_proj",
+        "self_attn.v_proj",
+        "self_attn.o_proj",
+        "mlp.gate_proj",
+        "mlp.up_proj",
+        "mlp.down_proj",
+    }

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,6 @@ class TestNemotronUltra(ModelTest):`
`19`	`19`	`}`
`20`	`20`	`EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)`
`21`	`21`	`TRUST_REMOTE_CODE = True`
`22`		`- SAVE_PATH = "./temp/Nemotron-Labs-Diffusion"`
`23`	`22`
`24`	`23`	`def test_nemotron_ultra(self):`
`25`	`24`	`self.quantize_and_evaluate()`