From 48d4ffdf22c63a9622e40f7399c054117f85527b Mon Sep 17 00:00:00 2001
From: Shengliang Xu <shengliangx@nvidia.com>
Date: Thu, 2 Apr 2026 15:20:59 -0700
Subject: [PATCH 1/2] Fix Qwen3-VL MoE export by refactoring experts to
 per-expert module containers

The quantized Qwen3VLMoeTextExperts wrapper (_QuantQwen3VLMoeTextExperts)
stored expert weights as three flat ModuleLists (gate_proj, up_proj,
down_proj), so the experts module itself was not iterable. The HF export
code requires `sub_module.experts` to be iterable, so
`export_hf_checkpoint` raised a NotImplementedError.

Refactor _QuantQwen3VLMoeTextExperts to use per-expert module
containers (matching the _QuantQwen35MoeExperts pattern):

- Add _Qwen3VLMoeExpertModule container class
- Register experts as numbered children (experts.{id}.gate_proj.weight)
- Implement __len__/__iter__/__getitem__ for iterability
- Add Qwen3VLMoeSparseMoeBlock to get_expert_linear_names

Signed-off-by: Shengliang Xu <shengliangx@nvidia.com>
---
 modelopt/torch/export/layer_utils.py          |  1 +
 .../torch/quantization/plugins/huggingface.py | 72 ++++++++++++-------
 2 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py
index 9a2cd4b2f0..7f203c2f12 100755
--- a/modelopt/torch/export/layer_utils.py
+++ b/modelopt/torch/export/layer_utils.py
@@ -972,6 +972,7 @@ def module_match_name_list(module, name_list):
             "Qwen3MoeSparseMoeBlock",
             "Qwen3NextSparseMoeBlock",
             "Qwen3_5MoeSparseMoeBlock",
+            "Qwen3VLMoeSparseMoeBlock",
             "DeepseekMoE",
         ],
     ):
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index 0d02716a6e..bc184dba2b 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -687,9 +687,27 @@ def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor:
         return self.w2_linear[expert_idx](x1)
 
 
+class _Qwen3VLMoeExpertModule(nn.Module):
+    """Container for a single Qwen3VL MoE expert's linear layers.
+
+    Produces the naming pattern: experts.{id}.gate_proj.weight
+    (consistent with standard Qwen3 MoE per-expert module structure).
+    """
+
+    def __init__(self, hidden_size: int, expert_dim: int):
+        super().__init__()
+        self.gate_proj = nn.Linear(hidden_size, expert_dim, bias=False)
+        self.up_proj = nn.Linear(hidden_size, expert_dim, bias=False)
+        self.down_proj = nn.Linear(expert_dim, hidden_size, bias=False)
+
+
 class _QuantQwen3VLMoeTextExperts(QuantModule):
     def _setup(self):
-        """Modify the Qwen3VLMoeTextExperts by using nn.Linear layers."""
+        """Modify the Qwen3VLMoeTextExperts by using per-expert nn.Module containers.
+
+        This produces the naming pattern: experts.{id}.gate_proj.weight
+        (consistent with standard Qwen3 MoE per-expert module structure).
+        """
         from accelerate import init_empty_weights
 
         dtype, device = self.gate_up_proj.dtype, self.gate_up_proj.device
@@ -709,35 +727,37 @@ def _copy_weight(module, weight):
             raise AttributeError("Could not find intermediate dimension size in model")
 
         with init_empty_weights():
-            gate_proj = nn.ModuleList(
-                [
-                    nn.Linear(self.hidden_size, expert_dim, bias=False)
-                    for _ in range(self.num_experts)
-                ]
-            )
-            up_proj = nn.ModuleList(
-                [
-                    nn.Linear(self.hidden_size, expert_dim, bias=False)
-                    for _ in range(self.num_experts)
-                ]
-            )
-            down_proj = nn.ModuleList(
+            expert_modules = nn.ModuleList(
                 [
-                    nn.Linear(expert_dim, self.hidden_size, bias=False)
+                    _Qwen3VLMoeExpertModule(self.hidden_size, expert_dim)
                     for _ in range(self.num_experts)
                 ]
             )
 
         for idx in range(self.num_experts):
-            _copy_weight(gate_proj[idx], self.gate_up_proj[idx, :, :expert_dim].T)
-            _copy_weight(up_proj[idx], self.gate_up_proj[idx, :, expert_dim:].T)
-            _copy_weight(down_proj[idx], self.down_proj[idx, :].T)
+            _copy_weight(expert_modules[idx].gate_proj, self.gate_up_proj[idx, :, :expert_dim].T)
+            _copy_weight(expert_modules[idx].up_proj, self.gate_up_proj[idx, :, expert_dim:].T)
+            _copy_weight(expert_modules[idx].down_proj, self.down_proj[idx, :].T)
 
         delattr(self, "gate_up_proj")
         delattr(self, "down_proj")
-        self.gate_proj = gate_proj
-        self.up_proj = up_proj
-        self.down_proj = down_proj
+        # Register expert modules directly as numbered children
+        # so the naming pattern is: experts.{id}.gate_proj.weight (no extra nesting)
+        for idx in range(self.num_experts):
+            self.add_module(str(idx), expert_modules[idx])
+
+    def __len__(self):
+        """Support len() so the module is iterable like standard MoE experts."""
+        return self.num_experts
+
+    def __iter__(self):
+        """Support iteration over expert modules."""
+        for idx in range(self.num_experts):
+            yield getattr(self, str(idx))
+
+    def __getitem__(self, idx):
+        """Support indexing to get individual expert modules."""
+        return getattr(self, str(int(idx)))
 
     def forward(
         self,
@@ -753,13 +773,15 @@ def forward(
             expert_mask = expert_mask.permute(2, 1, 0)
             expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
         for expert_idx in expert_hit:
+            expert_idx = expert_idx[0]
             with torch.no_grad():
-                _, token_idx = torch.where(expert_mask[expert_idx[0]])
+                _, token_idx = torch.where(expert_mask[expert_idx])
             current_state = hidden_states[token_idx]
-            gate = self.gate_proj[expert_idx](current_state)
-            up = self.up_proj[expert_idx](current_state)
+            expert = self[expert_idx]
+            gate = expert.gate_proj(current_state)
+            up = expert.up_proj(current_state)
             gated_output = up * self.act_fn(gate)
-            out = self.down_proj[expert_idx](gated_output)
+            out = expert.down_proj(gated_output)
             weighted_output = out * routing_weights[token_idx, expert_idx, None]
             next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
         next_states = next_states.view(batch_size, -1, self.hidden_size)

From adc3817df60893b82b8257f42c800c5f2d7c2f2d Mon Sep 17 00:00:00 2001
From: Shengliang Xu <shengliangx@nvidia.com>
Date: Thu, 2 Apr 2026 15:39:31 -0700
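Subject: [PATCH 2/2] Fix Qwen3-VL MoE block name in get_expert_linear_names

Use "Qwen3VLMoeTextSparseMoeBlock" instead of "Qwen3VLMoeSparseMoeBlock"
in the list of MoE block names added by the previous patch.
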
Signed-off-by: Shengliang Xu <shengliangx@nvidia.com>
---
 modelopt/torch/export/layer_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py
index 7f203c2f12..30f51ec43b 100755
--- a/modelopt/torch/export/layer_utils.py
+++ b/modelopt/torch/export/layer_utils.py
@@ -972,7 +972,7 @@ def module_match_name_list(module, name_list):
             "Qwen3MoeSparseMoeBlock",
             "Qwen3NextSparseMoeBlock",
             "Qwen3_5MoeSparseMoeBlock",
-            "Qwen3VLMoeSparseMoeBlock",
+            "Qwen3VLMoeTextSparseMoeBlock",
             "DeepseekMoE",
         ],
     ):