@@ -95,19 +95,33 @@ def get_experts_list(module: torch.nn.Module, model_type: str):
             "qwen2moeforcausallm",
             "qwen3moeforcausallm",
             "qwen3nextforcausallm",
+            "qwen3vlmoe",
         ]
     ):
         linear_names = ["gate_proj", "down_proj", "up_proj"]
     else:
         raise NotImplementedError(f"{model_type} not supported")
 
-    # Common logic for all supported model types
-    experts_list.extend(
-        [
-            [_get_expert_attr(module.experts, i, linear_name) for i in range(len(module.experts))]
-            for linear_name in linear_names
-        ]
-    )
+    # Check if experts use per-linear ModuleList structure (e.g., Qwen3VLMoeTextExperts)
+    # where experts.gate_proj is a ModuleList, instead of experts[i].gate_proj
+    first_linear = linear_names[0]
+    if hasattr(module.experts, first_linear) and isinstance(
+        getattr(module.experts, first_linear), nn.ModuleList
+    ):
+        experts_list.extend(
+            [list(getattr(module.experts, linear_name)) for linear_name in linear_names]
+        )
+    else:
+        # Standard per-expert structure: experts[i].linear_name
+        experts_list.extend(
+            [
+                [
+                    _get_expert_attr(module.experts, i, linear_name)
+                    for i in range(len(module.experts))
+                ]
+                for linear_name in linear_names
+            ]
+        )
 
     return experts_list
 
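For reviewers, a minimal sketch of the two expert-container layouts this branch distinguishes. The toy classes below are illustrative stand-ins, not the real Qwen3VLMoe modules; only the attribute structure matters.

```python
import torch.nn as nn

NUM_EXPERTS, HIDDEN, INTER = 4, 8, 16


class PerExpertMLP(nn.Module):
    """Standard layout: each expert owns its projections -> experts[i].gate_proj."""

    def __init__(self):
        super().__init__()
        self.gate_proj = nn.Linear(HIDDEN, INTER, bias=False)
        self.up_proj = nn.Linear(HIDDEN, INTER, bias=False)
        self.down_proj = nn.Linear(INTER, HIDDEN, bias=False)


class PerLinearExperts(nn.Module):
    """Per-linear layout: one module whose projections are ModuleLists -> experts.gate_proj[i]."""

    def __init__(self):
        super().__init__()
        self.gate_proj = nn.ModuleList(nn.Linear(HIDDEN, INTER, bias=False) for _ in range(NUM_EXPERTS))
        self.up_proj = nn.ModuleList(nn.Linear(HIDDEN, INTER, bias=False) for _ in range(NUM_EXPERTS))
        self.down_proj = nn.ModuleList(nn.Linear(INTER, HIDDEN, bias=False) for _ in range(NUM_EXPERTS))


standard_experts = nn.ModuleList(PerExpertMLP() for _ in range(NUM_EXPERTS))
per_linear_experts = PerLinearExperts()

# The new branch keys off exactly this check: experts.<first_linear> being an
# nn.ModuleList signals the per-linear layout; otherwise the per-expert layout
# is assumed and experts is indexed directly.
assert isinstance(getattr(per_linear_experts, "gate_proj", None), nn.ModuleList)
assert not isinstance(getattr(standard_experts, "gate_proj", None), nn.ModuleList)
```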
@@ -1150,6 +1164,24 @@ def set_expert_quantizer_amax(
 _GATE_UP_PAIRS = [("gate_proj", "up_proj"), ("w1", "w3")]
 
 
+def _sync_gate_up_pair(gate_linear, up_linear) -> bool:
+    """Sync weight quantizer amaxes for a single gate/up pair. Returns True if synced."""
+    gate_wq = getattr(gate_linear, "weight_quantizer", None)
+    up_wq = getattr(up_linear, "weight_quantizer", None)
+    if gate_wq is None or up_wq is None:
+        return False
+    gate_amax = getattr(gate_wq, "amax", None)
+    up_amax = getattr(up_wq, "amax", None)
+    if gate_amax is None or up_amax is None:
+        return False
+    if not torch.equal(gate_amax, up_amax):
+        shared_amax = torch.max(gate_amax, up_amax)
+        gate_wq.amax = shared_amax
+        up_wq.amax = shared_amax.clone()
+        return True
+    return False
+
+
 def sync_moe_gate_up_amax(model: nn.Module) -> int:
     """Take element-wise max of gate and up weight quantizer amaxes per expert.
 
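A toy walk-through of the helper's behavior on one gate/up pair. FakeQuantizer is a stand-in that only mimics the amax attribute of a real weight quantizer, and the snippet assumes _sync_gate_up_pair from the hunk above is in scope:

```python
import torch
import torch.nn as nn


class FakeQuantizer:
    """Stand-in object; only the amax attribute is relevant to the helper."""

    def __init__(self, amax):
        self.amax = amax


gate = nn.Linear(8, 16, bias=False)
up = nn.Linear(8, 16, bias=False)
gate.weight_quantizer = FakeQuantizer(torch.tensor([0.8, 1.2]))
up.weight_quantizer = FakeQuantizer(torch.tensor([1.0, 0.9]))

# The amaxes differ, so the helper writes the element-wise max back to both
# quantizers and reports that a sync happened.
assert _sync_gate_up_pair(gate, up)
assert torch.equal(gate.weight_quantizer.amax, torch.tensor([1.0, 1.2]))
assert torch.equal(up.weight_quantizer.amax, torch.tensor([1.0, 1.2]))

# A second call is a no-op because the amaxes are now identical.
assert not _sync_gate_up_pair(gate, up)
```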
@@ -1162,35 +1194,43 @@ def sync_moe_gate_up_amax(model: nn.Module) -> int:
     (e.g. Qwen MoE, DeepSeek). Models with already-fused gate_up_proj
     (e.g. Llama4, GptOss) are unaffected.
 
+    Supports both standard per-expert structure (experts[i].gate_proj) and
+    per-linear ModuleList structure (experts.gate_proj[i], e.g. Qwen3VLMoeTextExperts).
+
     Returns:
         Number of expert gate/up pairs whose amaxes were synced.
     """
     synced = 0
     for _, sub_module in model.named_modules():
         if not (is_moe(sub_module) and hasattr(sub_module, "experts")):
             continue
-        if not hasattr(sub_module.experts, "__iter__"):
-            continue
-        for expert in sub_module.experts:
-            for gate_name, up_name in _GATE_UP_PAIRS:
-                gate_linear = getattr(expert, gate_name, None)
-                up_linear = getattr(expert, up_name, None)
-                if gate_linear is None or up_linear is None:
-                    continue
-                gate_wq = getattr(gate_linear, "weight_quantizer", None)
-                up_wq = getattr(up_linear, "weight_quantizer", None)
-                if gate_wq is None or up_wq is None:
-                    break
-                gate_amax = getattr(gate_wq, "amax", None)
-                up_amax = getattr(up_wq, "amax", None)
-                if gate_amax is None or up_amax is None:
+
+        experts = sub_module.experts
+
+        # Check for per-linear ModuleList structure (e.g., Qwen3VLMoeTextExperts)
+        # where experts.gate_proj is a ModuleList instead of experts[i].gate_proj
+        is_modulelist_pattern = False
+        for gate_name, up_name in _GATE_UP_PAIRS:
+            gate_list = getattr(experts, gate_name, None)
+            up_list = getattr(experts, up_name, None)
+            if isinstance(gate_list, nn.ModuleList) and isinstance(up_list, nn.ModuleList):
+                for gate_linear, up_linear in zip(gate_list, up_list):
+                    if _sync_gate_up_pair(gate_linear, up_linear):
+                        synced += 1
+                is_modulelist_pattern = True
+                break  # Found matching pair pattern, no need to check others
+
+        # Standard per-expert structure: experts[i].gate_proj
+        if not is_modulelist_pattern and hasattr(experts, "__iter__"):
+            for expert in experts:
+                for gate_name, up_name in _GATE_UP_PAIRS:
+                    gate_linear = getattr(expert, gate_name, None)
+                    up_linear = getattr(expert, up_name, None)
+                    if gate_linear is None or up_linear is None:
+                        continue
+                    if _sync_gate_up_pair(gate_linear, up_linear):
+                        synced += 1
                     break
-                if not torch.equal(gate_amax, up_amax):
-                    shared_amax = torch.max(gate_amax, up_amax)
-                    gate_wq.amax = shared_amax
-                    up_wq.amax = shared_amax.clone()
-                    synced += 1
-                break
     return synced
 
 
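For context, a rough numeric sketch (hypothetical amax values, simple per-tensor int8 scaling) of why gate and up must agree on amax once the two projections are fused into a single gate_up_proj at export time:

```python
import torch

# Hypothetical calibrated amax values for one expert's gate_proj and up_proj weights.
gate_amax = torch.tensor(0.8)
up_amax = torch.tensor(1.2)

# A fused gate_up_proj weight carries a single scale, so the only choice that
# avoids clipping either half is the element-wise max of the two amaxes; the
# half with the smaller dynamic range merely loses a little resolution.
shared_amax = torch.max(gate_amax, up_amax)
for name, amax in [("gate", gate_amax), ("up", up_amax), ("shared", shared_amax)]:
    print(f"{name} int8 scale: {amax.item() / 127.0:.6f}")
```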