Skip to content

Commit 37c14f6

Browse files
authored
Add Qwen 3.6 MoE quantization regressions (#2752)
1 parent 28b2ec6 commit 37c14f6

2 files changed

Lines changed: 161 additions & 0 deletions

File tree

tests/models/test_qwen3_6_moe.py

Lines changed: 85 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,85 @@
1+
# SPDX-FileCopyrightText: 2026 ModelCloud.ai
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from __future__ import annotations
5+
6+
import os
7+
8+
from model_test import ModelTest
9+
10+
from gptqmodel.quantization.config import ExpertsRoutingOverride, Fallback, MoEConfig, VramStrategy
11+
12+
13+
# Regex matching module paths that sit under decoder layers 0-35 of
# ``model.language_model.layers``. The trailing ``(?:\.|$)`` requires the layer index to
# end right there, so an index such as 36 is not matched through its leading digit.
LAST_FOUR_ONLY_NEGATIVE_MATCH = r"^model\.language_model\.layers\.(?:[0-9]|[1-2][0-9]|3[0-5])(?:\.|$)"
14+
15+
16+
def _resolve_qwen3_6_moe_model_path() -> str:
    """Return the local Qwen 3.6 MoE checkpoint path used by this regression.

    Resolution order:

    1. ``GPTQMODEL_QWEN3_6_MOE_MODEL_PATH`` when it is set to a non-empty value. The
       override is returned as-is, without checking that it exists, so repro runs can
       point anywhere.
    2. The user-requested ``/moonster`` directory, if it exists.
    3. The legacy ``/monster`` directory, if it exists.
    4. Otherwise the user-requested path again, even though it is not a directory.

    Returns:
        The selected model path as a string.
    """
    override = os.environ.get("GPTQMODEL_QWEN3_6_MOE_MODEL_PATH")
    if override:
        return override

    requested = "/moonster/data/model/Qwen3.6-35B-A3B"
    legacy_fallback = "/monster/data/model/Qwen3.6-35B-A3B"

    # Probe in priority order; the first directory that exists wins.
    for candidate in (requested, legacy_fallback):
        if os.path.isdir(candidate):
            return candidate

    # Neither location is present: fall back to the preferred path unchanged.
    return requested
30+
31+
32+
class TestQwen3_6Moe(ModelTest):
    """Fast 3-A100 compat regression for the Qwen 3.6 MoE checkpoint released on the Qwen hub.

    All configuration is expressed as class attributes consumed by ``ModelTest``; the
    single test method only triggers ``quantize_and_evaluate()``.
    """

    FALLBACK = Fallback()
    # Resolved once, when the class body is executed at import time.
    NATIVE_MODEL_ID = _resolve_qwen3_6_moe_model_path()
    DATASET_SIZE = 16
    DATASET_CONCAT_SIZE = 1024
    QUANT_BATCH_SIZE = 1
    EVAL_BATCH_SIZE = "auto"
    # Keep post-quant validation spread across the visible A100 pool instead of forcing a single-GPU reload.
    EVAL_SINGLE_GPU = False
    # The native checkpoint has 40 decoder layers; skip 0-35 so only the last four are quantized.
    DYNAMIC = {
        f"-:{LAST_FOUR_ONLY_NEGATIVE_MATCH}": {},
    }
    DENSE_VRAM_STRATEGY = VramStrategy.EXCLUSIVE
    # Keep the dense shell on the first visible device and spread expert work across the rest.
    DENSE_VRAM_STRATEGY_DEVICES = ["cuda:0"]
    MOE_VRAM_STRATEGY = VramStrategy.BALANCED
    # The mixed RTX 4090 slot on this host OOMs during late replay for this 35B MoE path,
    # so keep the quantization pool on the A100-class devices.
    MOE_VRAM_STRATEGY_DEVICES = ["cuda:1", "cuda:2"]
    # Fast compat mode already trims the decoder stack; keep routing aligned with the checkpoint default.
    MOE_CONFIG = MoEConfig(routing=ExpertsRoutingOverride(num_experts_per_tok=8))
    EVAL_TASKS_FAST = {
        "gsm8k_platinum_cot": {
            "chat_template": False,
            # Evaluate the already-loaded post-quant wrapper so the three-A100 placement survives into Evalution.
            "evalution_batch_size": 4,
            "evalution_model_args": {
                "dtype": "bfloat16",
                "attn_implementation": "paged|flash_attention_2",
                "device_map": "auto",
            },
            "evalution_suite_kwargs": {
                "batch_size": 4,
                "max_new_tokens": 256,
                "stream": True,
                "max_rows": 8,
            },
            "acc,num": {
                "value": {
                    # Recorded on 2026-04-16 with three visible A100s (dense=cuda:0, MoE=cuda:1,cuda:2).
                    "A100": 1.0,
                },
                "floor_pct": 0.04,
                "ceil_pct": 1.0,
            },
        },
    }
    # The slow profile deliberately aliases the fast one (same dict object, not a copy).
    EVAL_TASKS_SLOW = EVAL_TASKS_FAST

    def test_qwen3_6_moe(self):
        """Run the inherited quantize-then-evaluate flow with the settings above."""
        self.quantize_and_evaluate()

tests/test_qwen3_6_moe_support.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# SPDX-FileCopyrightText: 2026 ModelCloud.ai
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from __future__ import annotations
5+
6+
import json
7+
8+
from transformers import AutoConfig
9+
10+
from gptqmodel.models.auto import check_and_get_model_definition
11+
from gptqmodel.models.definitions.qwen3_5_moe import Qwen3_5_MoeQModel
12+
13+
14+
def test_qwen3_6_moe_reuses_the_qwen3_5_moe_transformers_definition(tmp_path):
    """Guard the real Qwen 3.6 MoE config shape shipped on the Hub.

    Writes a ``config.json`` with the Qwen 3.6 MoE layout into a temporary directory,
    then checks that both ``AutoConfig`` and ``check_and_get_model_definition`` resolve
    it to the existing Qwen 3.5 MoE config class and model definition.

    Args:
        tmp_path: pytest-provided temporary directory (``pathlib.Path``).
    """
    # 40 entries: three linear-attention layers followed by one full-attention layer, ten times.
    layer_types = ["linear_attention", "linear_attention", "linear_attention", "full_attention"] * 10
    config = {
        "architectures": ["Qwen3_5MoeForConditionalGeneration"],
        "image_token_id": 248056,
        "model_type": "qwen3_5_moe",
        "text_config": {
            "dtype": "bfloat16",
            "full_attention_interval": 4,
            "hidden_size": 2048,
            "layer_types": layer_types,
            "max_position_embeddings": 262144,
            "model_type": "qwen3_5_moe_text",
            "moe_intermediate_size": 512,
            "num_attention_heads": 16,
            "num_experts": 256,
            "num_experts_per_tok": 8,
            "num_hidden_layers": 40,
            "num_key_value_heads": 2,
            "partial_rotary_factor": 0.25,
            "rope_parameters": {
                "mrope_interleaved": True,
                "mrope_section": [11, 11, 10],
                "partial_rotary_factor": 0.25,
                "rope_theta": 10000000,
                "rope_type": "default",
            },
            "shared_expert_intermediate_size": 512,
            "tie_word_embeddings": False,
            "use_cache": True,
            "vocab_size": 248320,
        },
        "tie_word_embeddings": False,
        "transformers_version": "4.57.1",
        "video_token_id": 248057,
        "vision_config": {
            "deepstack_visual_indexes": [],
            "depth": 27,
            "hidden_size": 1152,
            "in_channels": 3,
            "intermediate_size": 4304,
            "model_type": "qwen3_5_moe",
            "num_heads": 16,
            "num_position_embeddings": 2304,
            "out_hidden_size": 2048,
            "patch_size": 16,
            "spatial_merge_size": 2,
            "temporal_patch_size": 2,
        },
        "vision_end_token_id": 248054,
        "vision_start_token_id": 248053,
    }

    # Only config.json is materialized; no weights are written for this check.
    model_dir = tmp_path / "qwen3_6_moe"
    model_dir.mkdir()
    (model_dir / "config.json").write_text(json.dumps(config), encoding="utf-8")

    resolved_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=False)
    model_definition = check_and_get_model_definition(model_dir, trust_remote_code=False)

    # The config class is compared by name, the model definition by identity.
    assert type(resolved_config).__name__ == "Qwen3_5MoeConfig"
    assert model_definition is Qwen3_5_MoeQModel

0 commit comments

Comments
 (0)