Skip to content

Commit 11ab076

Browse files
committed
Address CodeRabbit review
Signed-off-by: Michal Guzek <mguzek@nvidia.com>
1 parent 9438b0d commit 11ab076

4 files changed

Lines changed: 42 additions & 11 deletions

File tree

tests/integration/defs/accuracy/references/mmmu.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ Qwen/Qwen3.5-35B-A3B:
64 64
- accuracy: 59.0
65 65
- dtype: bfloat16
66 66
accuracy: 60.444
67+
- quant_algo: FP8_BLOCK_SCALES
68+
accuracy: 58.889
67 69
# Kimi K2.5 multimodal (MoonViT + DeepSeek-V3 MoE backbone, ~1T params).
68 70
# Values below are measured with NVFP4 checkpoint (thinking mode enabled).
69 71
moonshotai/Kimi-K2.5:

tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -441,13 +441,24 @@ class TestQwen3_5_35B_A3B_VL(LlmapiAccuracyTestHarness):
441 441

442 442
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False)
443 443

444-
def test_auto_dtype(self) -> None:
445-
with LLM(
446-
self.MODEL_PATH,
444+
def _make_llm(self, model_path: str) -> LLM:
445+
return LLM(
446+
model_path,
447 447
max_num_tokens=self.MAX_NUM_TOKENS,
448 448
max_batch_size=self.MAX_BATCH_SIZE,
449 449
kv_cache_config=self.kv_cache_config,
450-
) as llm:
450+
)
451+
452+
def test_auto_dtype(self) -> None:
453+
with self._make_llm(self.MODEL_PATH) as llm:
454+
task = MMMU(self.MODEL_NAME)
455+
task.evaluate(llm, sampling_params=self.sampling_params)
456+
457+
@skip_pre_hopper
458+
def test_fp8_prequantized(self) -> None:
459+
model_path = f"{llm_models_root()}/Qwen3.5-35B-A3B-FP8"
460+
with self._make_llm(model_path) as llm:
461+
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
451 462
task = MMMU(self.MODEL_NAME)
452 463
task.evaluate(llm, sampling_params=self.sampling_params)
453 464

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -798,6 +798,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL::test_auto_dtype[forced
798 798
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype
799 799
accuracy/test_llm_api_pytorch_multimodal.py::TestKimiK25::test_nvfp4[dep8]
800 800
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_auto_dtype
801+
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3_5_35B_A3B_VL::test_fp8_prequantized
801 802
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
802 803
accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray
803 804
unittest/disaggregated/test_openai_disagg_server.py

tests/unittest/_torch/modeling/test_modeling_qwen3_5_vl_moe.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -400,13 +400,18 @@ def get_trtllm_inputs(
400 400
return trtllm_inputs
401 401

402 402
def get_scenarios(self) -> List[MultimodalScenario]:
403-
"""Minimal scenario sweep for the initial coverage.
404-
405-
Starts with one image scenario, no CUDA graph / chunked
406-
prefill / kv-cache reuse — those add additional surface area
407-
(mRoPE handling under graph capture, multimodal cumsum under
408-
chunking, etc.) that's worth adding incrementally once the
409-
baseline parity passes.
403+
"""Modality-sanity sweep (image / multiple_image / video).
404+
405+
These three catch differences in placeholder counts and the
406+
multimodal-cumsum path between single-image, multi-image, and
407+
video inputs.
408+
409+
CUDA-graph capture is intentionally not exercised here. The
410+
standard `attn_metadata.create_cuda_graph_metadata` path only
411+
addresses attention metadata; the Mamba SSM state buffer of the
412+
hybrid (Mamba + attention) cache is not threaded through, so
413+
replayed logits diverge from the HF reference. Adding that path
414+
is dedicated harness work and tracked separately.
410 415
"""
411 416
return [
412 417
MultimodalScenario(
@@ -415,6 +420,18 @@ def get_scenarios(self) -> List[MultimodalScenario]:
415 420
chunked_prefill=False,
416 421
kv_cache_reuse=False,
417 422
),
423+
MultimodalScenario(
424+
modality="multiple_image",
425+
use_cuda_graph=False,
426+
chunked_prefill=False,
427+
kv_cache_reuse=False,
428+
),
429+
MultimodalScenario(
430+
modality="video",
431+
use_cuda_graph=False,
432+
chunked_prefill=False,
433+
kv_cache_reuse=False,
434+
),
418 435
]
419 436

420 437
def test_construction_and_weight_loading_smoke(self):

0 commit comments

Comments
 (0)