Address review comments

moraxu · moraxu · commit dbf97672973a · 2026-05-19T13:53:11.000-07:00
Signed-off-by: Michal Guzek <mguzek@nvidia.com>
diff --git a/docs/source/models/supported-models.md b/docs/source/models/supported-models.md
@@ -95,6 +95,7 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
 | `Qwen2_5_VLForConditionalGeneration` | Yes               | Yes        | Yes             | Yes           | Yes              | Yes            | Yes                   | Yes                       | L + I + V |
 | `Qwen3VLForConditionalGeneration`    | Yes               | Yes        | Yes             | Yes           | Yes              | Yes            | Yes                   | Yes                       | L + I + V |
 | `Qwen3VLMoeForConditionalGeneration` | Yes               | Yes        | Yes             | Yes           | Yes              | Yes            | Yes                   | Yes                       | L + I + V |
+| `Qwen3_5MoeForConditionalGeneration` | Yes               | Yes        | Untested        | Yes           | Yes              | No             | Untested              | Yes                       | L + I + V |
 
 Note:
 - L: Language
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_5.py b/tensorrt_llm/_torch/models/modeling_qwen3_5.py
@@ -35,7 +35,7 @@
 # runtime layer asks the model module how to load its own config.
 #
 # There are two entry points:
-#   - `_Qwen35ConfigCompat.normalize(config_dict)` — for text-only
+#   - `Qwen35ConfigCompat.normalize(config_dict)` — for text-only
 #     Qwen3.5 (MoE and dense). Returns a dict that
 #     `transformers.Qwen3NextConfig.from_dict(...)` can consume, so the
 #     existing Qwen3Next runtime is reused unchanged.
@@ -45,7 +45,7 @@
 #     while keeping `text_config` / `vision_config` composite.
 
 
-class _Qwen35ConfigCompat:
+class Qwen35ConfigCompat:
     """Temporary shim for flattening Qwen3.5 text configs into Qwen3NextConfig.
 
     We normalize to `Qwen3NextConfig` (rather than to a Qwen3.5-native
@@ -66,9 +66,9 @@ class _Qwen35ConfigCompat:
     @staticmethod
     def normalize(config_dict: dict) -> dict:
         """Entry point: raw config.json dict -> flat Qwen3NextConfig-compatible dict."""
-        text_config = _Qwen35ConfigCompat._extract_text_config(config_dict)
-        text_config = _Qwen35ConfigCompat._inherit_quantization_config(config_dict, text_config)
-        text_config = _Qwen35ConfigCompat._flatten_rope(text_config)
+        text_config = Qwen35ConfigCompat._extract_text_config(config_dict)
+        text_config = Qwen35ConfigCompat._inherit_quantization_config(config_dict, text_config)
+        text_config = Qwen35ConfigCompat._flatten_rope(text_config)
 
         # Detect dense vs MoE and set architecture + MoE defaults accordingly
         is_moe = "num_experts" in text_config and text_config["num_experts"] > 0
@@ -93,7 +93,7 @@ def normalize(config_dict: dict) -> dict:
     def _extract_text_config(config_dict: dict) -> dict:
         """Pull nested text_config from VLM checkpoints, or use dict as-is."""
         architectures = config_dict.get("architectures") or []
-        if architectures and architectures[0] in _Qwen35ConfigCompat._VLM_ARCHITECTURES:
+        if architectures and architectures[0] in Qwen35ConfigCompat._VLM_ARCHITECTURES:
             text_config = dict(config_dict.get("text_config") or {})
         else:
             text_config = dict(config_dict)
@@ -116,10 +116,10 @@ def _inherit_quantization_config(config_dict: dict, text_config: dict) -> dict:
 
         quantization_config = dict(config_dict["quantization_config"])
         if "modules_to_not_convert" in quantization_config:
-            modules = _Qwen35ConfigCompat._normalize_exclude_modules(
+            modules = Qwen35ConfigCompat._normalize_exclude_modules(
                 quantization_config["modules_to_not_convert"]
             )
-            modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(text_config, modules)
+            modules = Qwen35ConfigCompat._add_qkvz_bf16_workaround(text_config, modules)
             quantization_config["modules_to_not_convert"] = sorted(set(modules))
         text_config["quantization_config"] = quantization_config
         return text_config
@@ -209,7 +209,7 @@ def _normalize_qwen35_mrope_config(text_config) -> None:
         return
     if hasattr(rope_parameters, "to_dict"):
         rope_parameters = rope_parameters.to_dict()
-    flattened = _Qwen35ConfigCompat._flatten_rope(
+    flattened = Qwen35ConfigCompat._flatten_rope(
         {
             "rope_parameters": dict(rope_parameters),
             "rope_scaling": dict(getattr(text_config, "rope_scaling", None) or {}),
@@ -245,9 +245,9 @@ def _normalize_qwen35_quantization_config(model_config) -> None:
         return
 
     text_config = getattr(model_config, "text_config", None)
-    normalized_modules = _Qwen35ConfigCompat._normalize_exclude_modules(modules)
+    normalized_modules = Qwen35ConfigCompat._normalize_exclude_modules(modules)
     if text_config is not None:
-        normalized_modules = _Qwen35ConfigCompat._add_qkvz_bf16_workaround(
+        normalized_modules = Qwen35ConfigCompat._add_qkvz_bf16_workaround(
             text_config.to_dict(), normalized_modules
         )
     quantization_config["modules_to_not_convert"] = sorted(set(normalized_modules))
@@ -331,7 +331,7 @@ class Qwen3_5ForCausalLM(Qwen3NextForCausalLM):
 
     Same reuse pattern as Qwen3_5MoeForCausalLM, but for the dense 27B
     variant which uses GatedMLP instead of SparseMoeBlock.  The config
-    normalizer (_Qwen35ConfigCompat) sets num_experts=0 so that
+    normalizer (Qwen35ConfigCompat) sets num_experts=0 so that
     Qwen3NextModel selects GatedMLP for the feed-forward layers.
     """
 
@@ -340,6 +340,7 @@ def __init__(self, model_config):
         super().__init__(model_config)
 
 
+# TODO: Add tests for disaggregated support.
 @support_multimodal_disaggregated
 @register_vision_encoder(Qwen3VisionModelBase, vlm_base_model=Qwen3VisionModel)
 @register_auto_model("Qwen3_5MoeForConditionalGeneration")
diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py
@@ -379,9 +379,9 @@ def load_pretrained_config(model_name_or_path: str,
                             )):
         # Qwen3.5 text-only: flatten to Qwen3NextConfig via the model-side shim.
         from tensorrt_llm._torch.models.modeling_qwen3_5 import \
-            _Qwen35ConfigCompat
+            Qwen35ConfigCompat
         model_config = transformers.Qwen3NextConfig.from_dict(
-            _Qwen35ConfigCompat.normalize(config_dict))
+            Qwen35ConfigCompat.normalize(config_dict))
     elif (model_type == "exaone4" and config_dict.get("sliding_window") is None
           and config_dict.get("layer_types") is None):
         # transformers 5.5.x Exaone4Config.__post_init__ first forces
diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml
@@ -23,6 +23,7 @@ l0_l40s:
   - unittest/_torch/modeling/test_modeling_qwen2_5vl.py::TestQwen2_5_VL::test_all
   - unittest/_torch/modeling/test_modeling_qwen3vl_moe.py::TestQwen3VLMoe::test_all
   - unittest/_torch/modeling/test_modeling_qwen3vl.py::TestQwen3VL::test_all
+  - unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py::TestQwen3_5MoeVL::test_all
   - test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B]
   - unittest/llmapi/apps/_test_openai_chat_multimodal.py::test_single_chat_session_image_embeds -m needs_l40s
   # MMMU sanity check
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
@@ -325,13 +325,9 @@ def create_trtllm_model(
         model = model_class(model_config, **kwargs).to("cuda")
 
         if load_weights:
-            weight_mapper_class = self.get_weight_mapper_class()
-            if weight_mapper_class is not None:
-                weight_mapper = weight_mapper_class()
-                weight_mapper.init_model_and_config(model, trtllm_config)
-                model.load_weights(hf_model_state_dict, weight_mapper)
-            else:
-                model.load_weights(hf_model_state_dict)
+            weight_mapper = self.get_weight_mapper_class()()
+            weight_mapper.init_model_and_config(model, trtllm_config)
+            model.load_weights(hf_model_state_dict, weight_mapper)
 
             for module in model.modules():
                 if hasattr(module, "post_load_weights") and not getattr(
@@ -346,6 +342,13 @@ def _dummy_request_kwargs(self, scenario):
         position-id buffer allocated at dummy-request time."""
         return {"use_mrope": True}
 
+    def get_tolerance(self):
+        """Tighten `rtol` to `0.1` (4x tighter than the base 0.4
+        default) while keeping `atol` at `0.4` to absorb single-logit
+        tail outliers seen on `multiple_image` / `video`. Returns `(atol, rtol)`.
+        """
+        return 0.4, 0.1
+
     def get_trtllm_inputs(
         self,
         input_ids,