NVIDIA
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
Lines changed: 1 addition & 9 deletions
diff --git a/scripts/check_test_list.py b/scripts/check_test_list.py
Lines changed: 5 additions & 0 deletions
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
Lines changed: 62 additions & 118 deletions
@@ -4467,15 +4467,7 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_H100-4_GPUs-PyTorch-Ray-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-AutoDeploy-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-AutoDeploy-Post-Merge-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-        "DGX_B200-PyTorch-1": ["auto:dgx-b200-flex", "l0_b200", 1, 9, 1, 1, true],
-        "DGX_B200-PyTorch-2": ["auto:dgx-b200-flex", "l0_b200", 2, 9, 1, 1, true],
-        "DGX_B200-PyTorch-3": ["auto:dgx-b200-flex", "l0_b200", 3, 9, 1, 1, true],
-        "DGX_B200-PyTorch-4": ["auto:dgx-b200-flex", "l0_b200", 4, 9, 1, 1, true],
-        "DGX_B200-PyTorch-5": ["auto:dgx-b200-flex", "l0_b200", 5, 9, 1, 1, true],
-        "DGX_B200-PyTorch-6": ["auto:dgx-b200-flex", "l0_b200", 6, 9, 1, 1, true],
-        "DGX_B200-PyTorch-7": ["auto:dgx-b200-flex", "l0_b200", 7, 9, 1, 1, true],
-        "DGX_B200-PyTorch-8": ["auto:dgx-b200-flex", "l0_b200", 8, 9, 1, 1, true],
-        "DGX_B200-PyTorch-9": ["auto:dgx-b200-flex", "l0_b200", 9, 9, 1, 1, true],
+        "DGX_B200-PyTorch-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
         "DGX_B200-AutoDeploy-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
         "DGX_B200-Triton-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
         "DGX_B200-PyTorch-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 2, 1, 1, true],
 
@@ -720,6 +720,11 @@ def verify_waive_list(llm_src, args):
     with open(tmp_waives_file, "w") as f:
         f.writelines(f"{line}\n" for line in sorted(processed_lines))
 
+    if not processed_lines:
+        print("No integration waive entries found; skipping collection.",
+              flush=True)
+        return
+
     subprocess.run(
         f"cd {llm_src}/tests/integration/defs && "
         f"pytest --test-list={tmp_waives_file} --output-dir={llm_src} -s --co -q",
 
@@ -23,7 +23,7 @@
     MTPDecodingConfig,
     SamplingParams,
 )
-from tensorrt_llm.llmapi.llm_args import MultimodalConfig, MultimodalEncoderCudaGraphConfig
+from tensorrt_llm.llmapi.llm_args import MultimodalConfig
 from tensorrt_llm.quantization import QuantAlgo
 
 from ..conftest import (
@@ -605,132 +605,75 @@ def test_auto_dtype(self, max_num_tokens):
             task.evaluate(llm, sampling_params=self.sampling_params)
 
 
+# The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
+# lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
+# uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
+# explicit image tiling/token accounting in the Mcore wrapper.
+# We also keep the generation budget small for CI speed, and this evaluator
+# does not strip the reasoning that precedes </think> before scoring. If the model
+# ignores the non-thinking directive, answer extraction may see the reasoning.
+EXTRA_EVALUATOR_KWARGS = dict(
+    apply_chat_template=True,
+    is_multimodal=True,
+)
+
+# NOTE: MMMU adds <|endoftext|> as a stop token.
+sampling_params = SamplingParams(
+    max_tokens=MMMU.MAX_OUTPUT_LEN,
+    truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
+    stop="<|endoftext|>",
+    temperature=0.0,
+    top_k=1,
+)
+MMMU_TASK_SPEC = (MMMU, sampling_params, EXTRA_EVALUATOR_KWARGS)
+
+voxpopuli_sampling_params = SamplingParams(
+    max_tokens=512,
+    truncate_prompt_tokens=VoxPopuli.MAX_INPUT_LEN,
+    temperature=0.0,
+    top_k=1,
+)
+no_thinking_evaluator_kwargs = {
+    # We explicitly disable thinking, because otherwise the thinking traces could
+    # be absurdly long (20k+ tokens), which helps neither test runtime nor
+    # reproducibility (the more tokens there are, the higher the likelihood of the
+    # end output not being the same).
+    # In addition, if reasoning is cut off, then the WER goes through the roof,
+    # since each word in the output is treated as an error.
+    "chat_template_kwargs": {"enable_thinking": False},
+}
+VOXPOPULI_TASK_SPEC = (
+    VoxPopuli,
+    voxpopuli_sampling_params,
+    no_thinking_evaluator_kwargs,
+)
+
+videomme_sampling_params = SamplingParams(
+    max_tokens=VideoMME.MAX_OUTPUT_LEN,
+    truncate_prompt_tokens=VideoMME.MAX_INPUT_LEN,
+    temperature=0.0,
+    top_k=1,
+)
+VIDEOMME_TASK_SPEC = (
+    VideoMME,
+    videomme_sampling_params,
+    no_thinking_evaluator_kwargs,
+)
+
+
 # Skip for B300 / GB300:
 # * B300 coverage does not meaningfully extend what we test via B200.
 # * GB300 may not be entirely up to date for `llm-models`, leading to repo-wide CI errors.
 @skip_post_blackwell_ultra
 class TestNanoV3Omni(LlmapiAccuracyTestHarness):
-    # The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
-    # lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
-    # uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
-    # explicit image tiling/token accounting in the Mcore wrapper.
-    # We also keep the generation budget small for CI speed, and this evaluator
-    # does not strip reasoning traces after </think> before scoring. If the model
-    # ignores the non-thinking directive, answer extraction may see the reasoning.
-    EXTRA_EVALUATOR_KWARGS = dict(
-        apply_chat_template=True,
-        is_multimodal=True,
-    )
-
-    # NOTE: MMMU adds <|endoftext|> to the stop token.
-    sampling_params = SamplingParams(
-        max_tokens=MMMU.MAX_OUTPUT_LEN,
-        truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
-        stop="<|endoftext|>",
-        temperature=0.0,
-        top_k=1,
-    )
-    MMMU_TASK_SPEC = (MMMU, sampling_params, EXTRA_EVALUATOR_KWARGS)
-
-    voxpopuli_sampling_params = SamplingParams(
-        max_tokens=512,
-        truncate_prompt_tokens=VoxPopuli.MAX_INPUT_LEN,
-        temperature=0.0,
-        top_k=1,
-    )
-    no_thinking_evaluator_kwargs = {
-        # We explicitly disable thinking, because otherwise the thinking traces could
-        # be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor
-        # for reproducibility (the more tokens there are, the higher likelihood of the
-        # end output not being the same).
-        # In addition, if reasoning is cut off, then the WER goes through the roof,
-        # since each word in the output is treated as an error.
-        "chat_template_kwargs": {"enable_thinking": False},
-    }
-    VOXPOPULI_TASK_SPEC = (
-        VoxPopuli,
-        voxpopuli_sampling_params,
-        no_thinking_evaluator_kwargs,
-    )
-
-    videomme_sampling_params = SamplingParams(
-        max_tokens=VideoMME.MAX_OUTPUT_LEN,
-        truncate_prompt_tokens=VideoMME.MAX_INPUT_LEN,
-        temperature=0.0,
-        top_k=1,
-    )
-    VIDEOMME_TASK_SPEC = (
-        VideoMME,
-        videomme_sampling_params,
-        no_thinking_evaluator_kwargs,
-    )
-
     @pytest.mark.skip_less_device_memory(80000)
     @pytest.mark.parametrize(
         (
             "model_name,model_path,kv_cache_config,max_batch_size,"
             "expected_quant_algo,task_specs,multimodal_config"
         ),
         [
-            pytest.param(
-                "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16",
-                f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16",
-                KvCacheConfig(
-                    free_gpu_memory_fraction=0.8,
-                    mamba_ssm_cache_dtype="float32",
-                    enable_block_reuse=False,
-                ),
-                32,
-                None,
-                (MMMU_TASK_SPEC,),
-                None,
-                id="bf16",
-            ),
-            pytest.param(
-                "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
-                f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
-                KvCacheConfig(
-                    free_gpu_memory_fraction=0.8,
-                    mamba_ssm_cache_dtype="float32",
-                    enable_block_reuse=False,
-                    dtype="fp8",
-                ),
-                64,
-                QuantAlgo.FP8,
-                (MMMU_TASK_SPEC, VOXPOPULI_TASK_SPEC, VIDEOMME_TASK_SPEC),
-                None,
-                marks=skip_pre_hopper,
-                id="fp8",
-            ),
-            pytest.param(
-                "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
-                f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
-                KvCacheConfig(
-                    free_gpu_memory_fraction=0.8,
-                    mamba_ssm_cache_dtype="float32",
-                    enable_block_reuse=False,
-                    dtype="fp8",
-                ),
-                64,
-                QuantAlgo.FP8,
-                (MMMU_TASK_SPEC,),
-                MultimodalConfig(
-                    encoder_cuda_graph={
-                        "vision": MultimodalEncoderCudaGraphConfig(
-                            # Uncomment to debug (logs will show hits / misses), which is how the
-                            # below buckets were determined.
-                            # enable_replay_stats=True,
-                            buckets=[
-                                (1280, 1),
-                                (4096, 1),
-                                (5500, 2),
-                            ],
-                        )
-                    },
-                ),
-                marks=skip_pre_hopper,
-                id="fp8_mmmu_encoder_cuda_graph",
-            ),
+            # TEMPORARY: duplicate the flaky NVFP4 case for B200 CI iteration.
             pytest.param(
                 "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
                 f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
@@ -740,13 +683,14 @@ class TestNanoV3Omni(LlmapiAccuracyTestHarness):
                     enable_block_reuse=False,
                     dtype="fp8",
                 ),
-                128,
+                64,
                 QuantAlgo.MIXED_PRECISION,
                 (MMMU_TASK_SPEC, VOXPOPULI_TASK_SPEC, VIDEOMME_TASK_SPEC),
                 None,
                 marks=(skip_pre_blackwell,),
-                id="nvfp4",
-            ),
+                id=f"nvfp4_repeat_{i}",
+            )
+            for i in range(1, 11)
         ],
     )
     # `torch.compile` uses a thread pool to compile and it's used in audio pre-processing.