Skip to content

Commit add1654

Browse files
committed
support hunyuan_v1_dense and hunyuan_v1_moe
Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>
1 parent c104604 commit add1654

10 files changed

Lines changed: 292 additions & 17 deletions

README.md

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
## Latest News
2323

24+
* 05/25/2026 7.1.0-dev `main`: ✨ Added `hunyuan_v1_dense` and `hunyuan_v1_moe` model support
2425
* 05/21/2026 7.1.0-dev `main`: ✨ Added `nemotron_labs_diffusion` model support
2526
* 05/20/2026 7.1.0-dev `main`: ✨ Added `interns1`, `ovis2_5`, `ovis2_6_moe` and `ovis2_6_next` model support
2627
* 05/15/2026 7.1.0-dev `main`: ✨ Added `mimo_v2` model support
@@ -248,23 +249,24 @@ Selected public references where teams or companies explicitly mention GPT-QMode
248249

249250
## Model Support
250251

251-
| Model | | | | | | | | | |
252-
|--------------------------|---|---------------------------------|---|------------------|---|---------------------------------|---|-------------------------|---|
252+
| Model | | | | | | | | | |
253+
|--------------------------|---|---------------------------------|--|------------------|--|---------------------------------|--|------------------------|---|
253254
| Apertus || EXAONE 3/4 || Dots1 || Mistral3 || Qwen 2/3/3.5 (Next/MoE) ||
254-
| Baichuan || Falcon (H1 / Mamba) || InternLM 1/2/2.5 || Mixtral || Qwen 2/2.5/3 VL ||
255-
| Bloom || FastVLM || Kimi K2 || MobileLLM || Qwen 2.5/3 Omni ||
256-
| ChatGLM || Gemma 1-4 / 3n || Klear || MOSS || RefinedWeb ||
257-
| CodeGen || GPTBigCode || LING/RING || MPT || StableLM ||
258-
| Cohere 1-2 || GPT-Neo / NeoX || Llama 1-3.3 || Nemotron H / Omni || StarCoder2 ||
259-
| DBRX Converted || GPT-2 || Llama 3.2 VL || Nemotron Ultra / Labs-Diffusion || TeleChat2 ||
260-
| Deci || GPT-J || Llama 4 || OPT || Trinity ||
261-
| DeepSeek-V2/V3/V4/R1 || GPT-OSS || LongCat Flash || OLMo2 / LLaDA2 || Yi ||
262-
| DeepSeek-V2-Lite || Granite / Granite MoE || LongLLaMA || Ovis 1.6/2/2.5/2.6 MoE/2.6 Next || Seed-OSS ||
263-
| Dream || GRIN-MoE || Instella || Phi 1-4 || Voxtral ||
264-
| ERNIE 4.5 / MoE / VL MoE || GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR || GLM4 MoE / Lite / 4.5V MoE || MiniCPM 3/O/V/V 4_6 || PanGu-α ||
265-
| XVERSE || Brumby || Hymba || Mistral || Qwen 1/2/3/3.5 ||
266-
| MiniMax M2 || AfMoE || Bailing-MoE || LFM2-MoE || Marin ||
267-
| InternVL Chat || Laguna || Mimo / Mimo V2 || Zamba / Zamba2 || Intern S1 ||
255+
| Baichuan || Falcon (H1 / Mamba) || InternLM 1/2/2.5 || Mixtral || Qwen 2/2.5/3 VL ||
256+
| Bloom || FastVLM || Kimi K2 || MobileLLM || Qwen 2.5/3 Omni ||
257+
| ChatGLM || Gemma 1-4 / 3n || Klear || MOSS || RefinedWeb ||
258+
| CodeGen || GPTBigCode || LING/RING || MPT || StableLM ||
259+
| Cohere 1-2 || GPT-Neo / NeoX || Llama 1-3.3 || Nemotron H / Omni || StarCoder2 ||
260+
| DBRX Converted || GPT-2 || Llama 3.2 VL || Nemotron Ultra / Labs-Diffusion || TeleChat2 ||
261+
| Deci || GPT-J || Llama 4 || OPT || Trinity ||
262+
| DeepSeek-V2/V3/V4/R1 || GPT-OSS || LongCat Flash || OLMo2 / LLaDA2 || Yi ||
263+
| DeepSeek-V2-Lite || Granite / Granite MoE || LongLLaMA || Ovis 1.6/2/2.5/2.6 MoE/2.6 Next || Seed-OSS ||
264+
| Dream || GRIN-MoE || Instella || Phi 1-4 || Voxtral ||
265+
| ERNIE 4.5 / MoE / VL MoE || GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR || GLM4 MoE / Lite / 4.5V MoE || MiniCPM 3/O/V/V 4_6 || PanGu-α ||
266+
| XVERSE || Brumby || Hymba || Mistral || Qwen 1/2/3/3.5 ||
267+
| MiniMax M2 || AfMoE || Bailing-MoE || LFM2-MoE || Marin ||
268+
| InternVL Chat || Laguna || Mimo / Mimo V2 || Zamba / Zamba2 || Intern S1 ||
269+
| HunYuan V1 Dense / MoE || | | | | | | | |
268270

269271
Prism Bonsai GGUF checkpoints are supported for inference only through GPT-QModel's native GGUF path and internal GGUF runtime. Bonsai checkpoints load through the normal model path or repo argument and do not require the external `gguf` package. Prism model quantization is not included.
270272

gptqmodel/models/auto.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@
114114
from .definitions.granitemoehybrid import GraniteMoeHybridQModel
115115
from .definitions.grinmoe import GrinMoeQModel # noqa: E402
116116
from .definitions.hrm_text import HrmTextQModel # noqa: E402
117+
from .definitions.hunyuan_v1_dense import HunYuanDenseV1QModel # noqa: E402
118+
from .definitions.hunyuan_v1_moe import HunYuanMoEV1QModel # noqa: E402
117119
from .definitions.hymba import HymbaQModel # noqa: E402
118120
from .definitions.instella import InstellaQModel # noqa: E402
119121
from .definitions.internlm import InternLMQModel # noqa: E402
@@ -230,6 +232,8 @@
230232
"interns1": InternS1QModel,
231233
"internvl_chat": InternVLChatQModel,
232234
"hrm_text": HrmTextQModel,
235+
"hunyuan_v1_dense": HunYuanDenseV1QModel,
236+
"hunyuan_v1_moe": HunYuanMoEV1QModel,
233237
"qwen": QwenQModel,
234238
"mistral": LlamaQModel, # 100% llama clone
235239
"yi": LlamaQModel, # 100% llama clone

gptqmodel/models/definitions/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
from .gptj import GptJQModel
4242
from .grinmoe import GrinMoeQModel
4343
from .hrm_text import HrmTextQModel
44+
from .hunyuan_v1_dense import HunYuanDenseV1QModel
45+
from .hunyuan_v1_moe import HunYuanMoEV1QModel
4446
from .hymba import HymbaQModel
4547
from .instella import InstellaQModel
4648
from .internlm import InternLMQModel
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# SPDX-FileCopyrightText: 2026 ModelCloud.ai
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from .llama import LlamaQModel
5+
6+
7+
class HunYuanDenseV1QModel(LlamaQModel):
    """Quantization definition for Hunyuan Dense V1 checkpoints.

    The decoder is laid out like Llama's, except that attention additionally
    carries per-head RMSNorm modules for the query and key projections
    (``query_layernorm`` / ``key_layernorm``). Those norms are listed as
    metadata/base modules (the ``:!`` entries) and must not be swapped for
    quantized linear kernels.
    """

    # NOTE(review): the leading strings presumably spell the path to the
    # decoder layers, with "#" standing in for the layer index - confirm
    # against the base class before relying on this.
    module_tree = [
        "model",
        "layers",
        "#",
        {
            "input_layernorm": ("input_layernorm:!",),
            "self_attn": (
                "query_layernorm:!",
                "q_proj:0",
                "key_layernorm:!",
                "k_proj:0",
                "v_proj:0",
                "o_proj:1",
            ),
            "post_attention_layernorm": ("post_attention_layernorm:!",),
            "mlp": (
                "gate_proj:0",
                "up_proj:0",
                "down_proj:1",
            ),
        },
    ]
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# SPDX-FileCopyrightText: 2026 ModelCloud.ai
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from gptqmodel.models.moe_lifecycle import GateUpDownMoELifecycleHooks
5+
6+
from ..base import BaseQModel
7+
8+
9+
class HunYuanMoEV1QModel(BaseQModel):
    """Quantization definition for Hunyuan MoE V1 checkpoints.

    Declares the per-layer module tree (attention, router gate, shared MLP
    and routed experts) together with the MoE lifecycle hooks used for this
    architecture.
    """

    # NOTE(review): presumably the config attribute holding the expert
    # count - confirm against BaseQModel.
    dynamic_expert_index = "num_experts"

    pre_lm_head_norm_module = "model.norm"

    # Hunyuan MoE attention is grouped-query (GQA), so AWQ must not force the
    # o_proj scaling shape to match v_proj.
    awq_scale_optimize_shape_dependent_modules = ["self_attn.o_proj"]

    # The shared expert block lives under the name "shared_mlp".
    moe_lifecycle_hooks = GateUpDownMoELifecycleHooks()
    moe_lifecycle_hooks.shared_expert_block_names = ["shared_mlp"]

    module_tree = [
        "model",
        "layers",
        "#",
        {
            "input_layernorm": ("input_layernorm:!",),
            "self_attn": (
                "q_proj:0",
                "k_proj:0",
                "v_proj:0",
                "o_proj:1",
                "query_layernorm:!",
                "key_layernorm:!",
            ),
            "post_attention_layernorm": ("post_attention_layernorm:!",),
            "mlp:moe:?": {
                # The router is tiny, so quantizing its weights is not
                # worthwhile; it is listed with the ":!" tag instead.
                "gate": ("gate:!",),
                # shared_mlp comes before the routed experts because the
                # original forward pass runs it first.
                "shared_mlp": (
                    "gate_proj:0",
                    "up_proj:0",
                    "down_proj:1",
                ),
                "experts:0": {
                    "#": (
                        "gate_proj:0",
                        "up_proj:0",
                        "down_proj:1",
                    ),
                },
            },
        },
    ]
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
2+
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
3+
# SPDX-License-Identifier: Apache-2.0
4+
# Contact: qubitium@modelcloud.ai, x.com/qubitium
5+
6+
from model_test import ModelTest
7+
8+
9+
class TestHunYuanV1Dense(ModelTest):
    """Quantize-and-evaluate regression test for a Hunyuan V1 dense checkpoint.

    The class and test method were previously named after Nemotron Ultra (a
    copy-paste leftover) and the test body only printed an evaluation of
    ``SAVE_PATH`` without quantizing or asserting anything. It now runs the
    same ``quantize_and_evaluate()`` flow as the other model tests.
    """

    NATIVE_MODEL_ID = "/monster/data/model/HY-MT1.5-1.8B"  # tencent/HY-MT1.5-1.8B

    # NOTE(review): confirm these baselines were measured on this checkpoint;
    # the identical numbers appear in another model test in this change.
    EVAL_TASKS_SLOW = {
        "arc_challenge": {
            "chat_template": True,
            "acc": {"value": 0.3182, "floor_pct": 0.36},
            "acc_norm": {"value": 0.3472, "floor_pct": 0.36},
        },
        "mmlu_stem": {
            "chat_template": False,
            "acc": {
                "value": 0.4024,
                "floor_pct": 0.04,
            },
        },
    }
    EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)

    def test_hunyuan_v1_dense(self):
        """Quantize the dense checkpoint and check it against the baselines."""
        self.quantize_and_evaluate()
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
2+
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
3+
# SPDX-License-Identifier: Apache-2.0
4+
# Contact: qubitium@modelcloud.ai, x.com/qubitium
5+
6+
from model_test import ModelTest
7+
8+
9+
class TestHunYuanV1MoE(ModelTest):
    """Quantize-and-evaluate regression test for a Hunyuan V1 MoE checkpoint.

    The class and test method were previously named after Nemotron Ultra (a
    copy-paste leftover); they are renamed to match the model under test.
    """

    NATIVE_MODEL_ID = "/monster/data/model/Hunyuan-A13B-Instruct"  # tencent/Hunyuan-A13B-Instruct

    # NOTE(review): confirm these baselines were measured on this checkpoint;
    # the identical numbers appear in another model test in this change.
    EVAL_TASKS_SLOW = {
        "arc_challenge": {
            "chat_template": True,
            "acc": {"value": 0.3182, "floor_pct": 0.36},
            "acc_norm": {"value": 0.3472, "floor_pct": 0.36},
        },
        "mmlu_stem": {
            "chat_template": False,
            "acc": {
                "value": 0.4024,
                "floor_pct": 0.04,
            },
        },
    }
    EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)

    def test_hunyuan_v1_moe(self):
        """Quantize the MoE checkpoint and check it against the baselines."""
        self.quantize_and_evaluate()

tests/models/test_nemotron_labs_diffusion.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ class TestNemotronUltra(ModelTest):
1919
}
2020
EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)
2121
TRUST_REMOTE_CODE = True
22-
SAVE_PATH = "./temp/Nemotron-Labs-Diffusion"
2322

2423
def test_nemotron_ultra(self):
2524
self.quantize_and_evaluate()
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# SPDX-FileCopyrightText: 2026 ModelCloud.ai
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from types import SimpleNamespace
5+
6+
import defuser
7+
from accelerate import init_empty_weights
8+
from transformers import AutoModelForCausalLM
9+
from transformers.models.hunyuan_v1_moe.configuration_hunyuan_v1_moe import HunYuanMoEV1Config
10+
11+
from gptqmodel.models import auto
12+
from gptqmodel.models.definitions.hunyuan_v1_dense import HunYuanDenseV1QModel
13+
from gptqmodel.models.definitions.hunyuan_v1_moe import HunYuanMoEV1QModel
14+
15+
16+
def test_hunyuan_v1_dense_model_type_selects_definition(monkeypatch):
    """A config reporting ``hunyuan_v1_dense`` resolves to the dense definition."""
    stub_config = SimpleNamespace(model_type="hunyuan_v1_dense")

    def _echo_trust_flag(path, trust_remote_code=False):
        # Stand-in that hands the caller's flag straight back.
        return trust_remote_code

    def _return_stub_config(*args, **kwargs):
        # Stand-in for AutoConfig.from_pretrained; always yields the stub.
        return stub_config

    monkeypatch.setattr(auto, "resolve_trust_remote_code", _echo_trust_flag)
    monkeypatch.setattr(auto.AutoConfig, "from_pretrained", _return_stub_config)

    resolved = auto.check_and_get_model_definition("/tmp/hunyuan_v1_dense")
    assert resolved is HunYuanDenseV1QModel
23+
24+
25+
def test_hunyuan_v1_dense_module_tree_skips_qk_norms():
    """The dense attention entry lists all projections and both Q/K norms."""
    attn_modules = HunYuanDenseV1QModel.module_tree[-1]["self_attn"]

    expected_entries = (
        "q_proj:0",
        "k_proj:0",
        "v_proj:0",
        "o_proj:1",
        "query_layernorm:!",
        "key_layernorm:!",
    )
    for entry in expected_entries:
        assert entry in attn_modules
34+
35+
36+
def test_hunyuan_v1_moe_model_type_selects_definition(monkeypatch):
    """A config reporting ``hunyuan_v1_moe`` resolves to the MoE definition."""
    stub_config = SimpleNamespace(model_type="hunyuan_v1_moe")

    def _echo_trust_flag(path, trust_remote_code=False):
        # Stand-in that hands the caller's flag straight back.
        return trust_remote_code

    def _return_stub_config(*args, **kwargs):
        # Stand-in for AutoConfig.from_pretrained; always yields the stub.
        return stub_config

    monkeypatch.setattr(auto, "resolve_trust_remote_code", _echo_trust_flag)
    monkeypatch.setattr(auto.AutoConfig, "from_pretrained", _return_stub_config)

    resolved = auto.check_and_get_model_definition("/tmp/hunyuan_v1_moe")
    assert resolved is HunYuanMoEV1QModel
43+
44+
45+
def test_hunyuan_v1_moe_module_tree_matches_defused_experts():
    """The MoE module tree lines up with a defused, single-layer toy model.

    Builds a tiny Hunyuan MoE model on empty weights, runs
    ``defuser.convert_model`` on it, and then checks that the attribute names
    present on the converted model are the ones the definition's module tree
    and ``simple_layer_modules`` output refer to.
    """
    tiny_config = HunYuanMoEV1Config(
        vocab_size=128,
        hidden_size=64,
        intermediate_size=32,
        num_hidden_layers=1,
        num_attention_heads=4,
        num_key_value_heads=2,
        num_experts=4,
        moe_topk=2,
        head_dim=16,
        max_position_embeddings=128,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=True,
    )

    # Empty weights keep the model cheap to build; only its structure matters.
    with init_empty_weights(include_buffers=True):
        model = AutoModelForCausalLM.from_config(tiny_config)

    converted = defuser.convert_model(model, cleanup_original=False)
    assert converted is True

    first_layer = model.model.layers[0]
    first_expert = first_layer.mlp.experts[0]

    for norm_name in ("query_layernorm", "key_layernorm"):
        assert hasattr(first_layer.self_attn, norm_name)
    assert hasattr(first_layer.mlp, "shared_mlp")
    for proj_name in ("gate_proj", "up_proj", "down_proj"):
        assert hasattr(first_expert, proj_name)

    layer_tree = HunYuanMoEV1QModel.module_tree[-1]
    attn_modules = layer_tree["self_attn"]
    mlp_tree = layer_tree["mlp:moe:?"]
    layer_modules = HunYuanMoEV1QModel.simple_layer_modules(
        model_config=tiny_config,
        quantize_config=SimpleNamespace(dynamic=None),
    )

    for norm_entry in ("query_layernorm:!", "key_layernorm:!"):
        assert norm_entry in attn_modules
    for mlp_key in ("shared_mlp", "experts:0"):
        assert mlp_key in mlp_tree

    shared_gate_up = ["mlp.shared_mlp.gate_proj", "mlp.shared_mlp.up_proj"]
    shared_down = ["mlp.shared_mlp.down_proj"]
    assert shared_gate_up in layer_modules
    assert shared_down in layer_modules

    for expert_module in ("mlp.experts.0.gate_proj", "mlp.experts.0.down_proj"):
        assert any(expert_module in block for block in layer_modules)
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# SPDX-FileCopyrightText: 2026 ModelCloud.ai
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from types import SimpleNamespace
5+
6+
from transformers import AutoModel
7+
8+
from gptqmodel.models import auto
9+
from gptqmodel.models.definitions.nemotron_labs_diffusion import NemotronLabsDiffusionQModel
10+
11+
12+
def test_nemotron_labs_diffusion_model_type_selects_definition(monkeypatch):
    """A config reporting ``nemotron_labs_diffusion`` resolves to its definition."""
    stub_config = SimpleNamespace(model_type="nemotron_labs_diffusion")

    def _echo_trust_flag(path, trust_remote_code=False):
        # Stand-in that hands the caller's flag straight back.
        return trust_remote_code

    def _return_stub_config(*args, **kwargs):
        # Stand-in for AutoConfig.from_pretrained; always yields the stub.
        return stub_config

    monkeypatch.setattr(auto, "resolve_trust_remote_code", _echo_trust_flag)
    monkeypatch.setattr(auto.AutoConfig, "from_pretrained", _return_stub_config)

    resolved = auto.check_and_get_model_definition("/tmp/nemotron_labs_diffusion")
    assert resolved is NemotronLabsDiffusionQModel
19+
20+
21+
def test_nemotron_labs_diffusion_definition_matches_remote_code_layout():
    """The Nemotron Labs Diffusion definition exposes the expected layout.

    Checks the loader, head/norm module names, AWQ shape-dependent modules,
    the layers node, and the flattened set of per-layer module names.
    """
    definition = NemotronLabsDiffusionQModel

    layer_modules = definition.simple_layer_modules(
        model_config=SimpleNamespace(),
        quantize_config=SimpleNamespace(dynamic=None),
    )
    # Collapse the per-block name lists into one set of module names.
    flat_modules = set().union(*layer_modules)

    assert definition.require_trust_remote_code is True
    assert definition.loader is AutoModel
    assert definition.lm_head == "diffusion_head"
    assert definition.pre_lm_head_norm_module == "encoder.norm"
    assert definition.awq_scale_optimize_shape_dependent_modules == ["self_attn.o_proj"]
    assert definition.extract_layers_node() == ["encoder.layers"]

    expected_modules = {
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj",
    }
    assert flat_modules == expected_modules

0 commit comments

Comments
 (0)