Address CodeRabbit review

moraxu · moraxu · commit 11ab076a0ffe · 2026-05-19T12:57:26.000-07:00
Signed-off-by: Michal Guzek <mguzek@nvidia.com>
diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml
@@ -64,6 +64,8 @@ Qwen/Qwen3.5-35B-A3B:
   - accuracy: 59.0
   - dtype: bfloat16
     accuracy: 60.444
+  - quant_algo: FP8_BLOCK_SCALES
+    accuracy: 58.889
 # Kimi K2.5 multimodal (MoonViT + DeepSeek-V3 MoE backbone, ~1T params).
 # Values below are measured with NVFP4 checkpoint (thinking mode enabled).
 moonshotai/Kimi-K2.5:
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
@@ -441,13 +441,24 @@ class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness):
 
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False)
 
-    def test_auto_dtype(self) -> None:
-        with LLM(
-            self.MODEL_PATH,
+    def _make_llm(self, model_path: str) -> LLM:
+        return LLM(
+            model_path,
             max_num_tokens=self.MAX_NUM_TOKENS,
             max_batch_size=self.MAX_BATCH_SIZE,
             kv_cache_config=self.kv_cache_config,
-        ) as llm:
+        )
+
+    def test_auto_dtype(self) -> None:
+        with self._make_llm(self.MODEL_PATH) as llm:
+            task = MMMU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=self.sampling_params)
+
+    @skip_pre_hopper
+    def test_fp8_prequantized(self) -> None:
+        model_path = f"{llm_models_root()}/Qwen3.5-35B-A3B-FP8"
+        with self._make_llm(model_path) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
 
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -798,6 +798,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL::test_auto_dtype[forced
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestKimiK25::test_nvfp4[dep8]
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_auto_dtype
+accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_fp8_prequantized
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray
 unittest/disaggregated/test_openai_disagg_server.py
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py
@@ -400,13 +400,18 @@ def get_trtllm_inputs(
         return trtllm_inputs
 
     def get_scenarios(self) -> List[MultimodalScenario]:
-        """Minimal scenario sweep for the initial coverage.
-
-        Starts with one image scenario, no CUDA graph / chunked
-        prefill / kv-cache reuse — those add additional surface area
-        (mRoPE handling under graph capture, multimodal cumsum under
-        chunking, etc.) that's worth adding incrementally once the
-        baseline parity passes.
+        """Modality-sanity sweep (image / multiple_image / video).
+
+        These three catch differences in placeholder counts and the
+        multimodal-cumsum path between single-image, multi-image, and
+        video inputs.
+
+        CUDA-graph capture is intentionally not exercised here. The
+        standard `attn_metadata.create_cuda_graph_metadata` path only
+        addresses attention metadata; the Mamba SSM state buffer of the
+        hybrid (Mamba + attention) cache is not threaded through, so
+        replayed logits diverge from the HF reference. Adding that path
+        is dedicated harness work and tracked separately.
         """
         return [
             MultimodalScenario(
@@ -415,6 +420,18 @@ def get_scenarios(self) -> List[MultimodalScenario]:
                 chunked_prefill=False,
                 kv_cache_reuse=False,
             ),
+            MultimodalScenario(
+                modality="multiple_image",
+                use_cuda_graph=False,
+                chunked_prefill=False,
+                kv_cache_reuse=False,
+            ),
+            MultimodalScenario(
+                modality="video",
+                use_cuda_graph=False,
+                chunked_prefill=False,
+                kv_cache_reuse=False,
+            ),
         ]
 
     def test_construction_and_weight_loading_smoke(self):