From 48d4ffdf22c63a9622e40f7399c054117f85527b Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 2 Apr 2026 15:20:59 -0700 Subject: [PATCH 1/2] Fix Qwen3-VL MoE export by refactoring experts to per-expert module containers Qwen3VLMoeTextExperts stored expert weights as flat ModuleLists (gate_proj, up_proj, down_proj), making the module non-iterable. The HF export code requires `sub_module.experts` to be iterable, causing a NotImplementedError during `export_hf_checkpoint`. Refactor _QuantQwen3VLMoeTextExperts to use per-expert module containers (matching the _QuantQwen35MoeExperts pattern): - Add _Qwen3VLMoeExpertModule container class - Register experts as numbered children (experts.{id}.gate_proj.weight) - Implement __len__/__iter__/__getitem__ for iterability - Add Qwen3VLMoeSparseMoeBlock to get_expert_linear_names Signed-off-by: Shengliang Xu --- modelopt/torch/export/layer_utils.py | 1 + .../torch/quantization/plugins/huggingface.py | 72 ++++++++++++------- 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py index 9a2cd4b2f0..7f203c2f12 100755 --- a/modelopt/torch/export/layer_utils.py +++ b/modelopt/torch/export/layer_utils.py @@ -972,6 +972,7 @@ def module_match_name_list(module, name_list): "Qwen3MoeSparseMoeBlock", "Qwen3NextSparseMoeBlock", "Qwen3_5MoeSparseMoeBlock", + "Qwen3VLMoeSparseMoeBlock", "DeepseekMoE", ], ): diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 0d02716a6e..bc184dba2b 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -687,9 +687,27 @@ def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor: return self.w2_linear[expert_idx](x1) +class _Qwen3VLMoeExpertModule(nn.Module): + """Container for a single Qwen3VL MoE expert's linear layers. + + Produces the naming pattern: experts.{id}.gate_proj.weight + (consistent with standard Qwen3 MoE per-expert module structure). + """ + + def __init__(self, hidden_size: int, expert_dim: int): + super().__init__() + self.gate_proj = nn.Linear(hidden_size, expert_dim, bias=False) + self.up_proj = nn.Linear(hidden_size, expert_dim, bias=False) + self.down_proj = nn.Linear(expert_dim, hidden_size, bias=False) + + class _QuantQwen3VLMoeTextExperts(QuantModule): def _setup(self): - """Modify the Qwen3VLMoeTextExperts by using nn.Linear layers.""" + """Modify the Qwen3VLMoeTextExperts by using per-expert nn.Module containers. + + This produces the naming pattern: experts.{id}.gate_proj.weight + (consistent with standard Qwen3 MoE per-expert module structure). + """ from accelerate import init_empty_weights dtype, device = self.gate_up_proj.dtype, self.gate_up_proj.device @@ -709,35 +727,37 @@ def _copy_weight(module, weight): raise AttributeError("Could not find intermediate dimension size in model") with init_empty_weights(): - gate_proj = nn.ModuleList( - [ - nn.Linear(self.hidden_size, expert_dim, bias=False) - for _ in range(self.num_experts) - ] - ) - up_proj = nn.ModuleList( - [ - nn.Linear(self.hidden_size, expert_dim, bias=False) - for _ in range(self.num_experts) - ] - ) - down_proj = nn.ModuleList( + expert_modules = nn.ModuleList( [ - nn.Linear(expert_dim, self.hidden_size, bias=False) + _Qwen3VLMoeExpertModule(self.hidden_size, expert_dim) for _ in range(self.num_experts) ] ) for idx in range(self.num_experts): - _copy_weight(gate_proj[idx], self.gate_up_proj[idx, :, :expert_dim].T) - _copy_weight(up_proj[idx], self.gate_up_proj[idx, :, expert_dim:].T) - _copy_weight(down_proj[idx], self.down_proj[idx, :].T) + _copy_weight(expert_modules[idx].gate_proj, self.gate_up_proj[idx, :, :expert_dim].T) + _copy_weight(expert_modules[idx].up_proj, self.gate_up_proj[idx, :, expert_dim:].T) + _copy_weight(expert_modules[idx].down_proj, self.down_proj[idx, :].T) delattr(self, "gate_up_proj") delattr(self, "down_proj") - self.gate_proj = gate_proj - self.up_proj = up_proj - self.down_proj = down_proj + # Register expert modules directly as numbered children + # so the naming pattern is: experts.{id}.gate_proj.weight (no extra nesting) + for idx in range(self.num_experts): + self.add_module(str(idx), expert_modules[idx]) + + def __len__(self): + """Support len() so the module is iterable like standard MoE experts.""" + return self.num_experts + + def __iter__(self): + """Support iteration over expert modules.""" + for idx in range(self.num_experts): + yield getattr(self, str(idx)) + + def __getitem__(self, idx): + """Support indexing to get individual expert modules.""" + return getattr(self, str(int(idx))) def forward( self, @@ -753,13 +773,15 @@ def forward( expert_mask = expert_mask.permute(2, 1, 0) expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() for expert_idx in expert_hit: + expert_idx = expert_idx[0] with torch.no_grad(): - _, token_idx = torch.where(expert_mask[expert_idx[0]]) + _, token_idx = torch.where(expert_mask[expert_idx]) current_state = hidden_states[token_idx] - gate = self.gate_proj[expert_idx](current_state) - up = self.up_proj[expert_idx](current_state) + expert = self[expert_idx] + gate = expert.gate_proj(current_state) + up = expert.up_proj(current_state) gated_output = up * self.act_fn(gate) - out = self.down_proj[expert_idx](gated_output) + out = expert.down_proj(gated_output) weighted_output = out * routing_weights[token_idx, expert_idx, None] next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype)) next_states = next_states.view(batch_size, -1, self.hidden_size) From adc3817df60893b82b8257f42c800c5f2d7c2f2d Mon Sep 17 00:00:00 2001 From: Shengliang Xu Date: Thu, 2 Apr 2026 15:39:31 -0700 Subject: [PATCH 2/2] correct name Signed-off-by: Shengliang Xu --- modelopt/torch/export/layer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py index 7f203c2f12..30f51ec43b 100755 --- a/modelopt/torch/export/layer_utils.py +++ b/modelopt/torch/export/layer_utils.py @@ -972,7 +972,7 @@ def module_match_name_list(module, name_list): "Qwen3MoeSparseMoeBlock", "Qwen3NextSparseMoeBlock", "Qwen3_5MoeSparseMoeBlock", - "Qwen3VLMoeSparseMoeBlock", + "Qwen3VLMoeTextSparseMoeBlock", "DeepseekMoE", ], ):