NVIDIA · 2ez4bz · Jun 23, 2026 · Jun 23, 2026
@@ -4467,15 +4467,7 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_H100-4_GPUs-PyTorch-Ray-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-AutoDeploy-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-AutoDeploy-Post-Merge-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-        "DGX_B200-PyTorch-1": ["auto:dgx-b200-flex", "l0_b200", 1, 9, 1, 1, true],
-        "DGX_B200-PyTorch-2": ["auto:dgx-b200-flex", "l0_b200", 2, 9, 1, 1, true],
-        "DGX_B200-PyTorch-3": ["auto:dgx-b200-flex", "l0_b200", 3, 9, 1, 1, true],
-        "DGX_B200-PyTorch-4": ["auto:dgx-b200-flex", "l0_b200", 4, 9, 1, 1, true],
-        "DGX_B200-PyTorch-5": ["auto:dgx-b200-flex", "l0_b200", 5, 9, 1, 1, true],
-        "DGX_B200-PyTorch-6": ["auto:dgx-b200-flex", "l0_b200", 6, 9, 1, 1, true],
-        "DGX_B200-PyTorch-7": ["auto:dgx-b200-flex", "l0_b200", 7, 9, 1, 1, true],
-        "DGX_B200-PyTorch-8": ["auto:dgx-b200-flex", "l0_b200", 8, 9, 1, 1, true],
-        "DGX_B200-PyTorch-9": ["auto:dgx-b200-flex", "l0_b200", 9, 9, 1, 1, true],
+        "DGX_B200-PyTorch-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
         "DGX_B200-AutoDeploy-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
         "DGX_B200-Triton-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
         "DGX_B200-PyTorch-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 2, 1, 1, true],

diff --git a/scripts/check_test_list.py b/scripts/check_test_list.py
@@ -720,6 +720,11 @@ def verify_waive_list(llm_src, args):
     with open(tmp_waives_file, "w") as f:
         f.writelines(f"{line}\n" for line in sorted(processed_lines))
 
+    if not processed_lines:
+        print("No integration waive entries found; skipping collection.",
+              flush=True)
+        return
+
     subprocess.run(
         f"cd {llm_src}/tests/integration/defs && "
         f"pytest --test-list={tmp_waives_file} --output-dir={llm_src} -s --co -q",

diff --git a/tensorrt_llm/evaluate/audio_asr.py b/tensorrt_llm/evaluate/audio_asr.py
@@ -20,7 +20,6 @@
 from typing import Any, Iterable, NamedTuple, Optional
 
 import soundfile
-from tqdm import tqdm
 
 import tensorrt_llm.profiler as profiler
 from tensorrt_llm.inputs import (
@@ -36,7 +35,13 @@
 from tensorrt_llm.logger import logger
 from tensorrt_llm.sampling_params import SamplingParams
 
-from .interface import Evaluator, get_chat_template_kwargs, get_model_context
+from .interface import (
+    RESULT_WAIT_TIMEOUT_SECS,
+    Evaluator,
+    get_chat_template_kwargs,
+    get_model_context,
+)
+from .progress import tqdm_with_time_prefix
 
 
 class MultimodalASRSample(NamedTuple):
@@ -174,15 +179,19 @@ def evaluate(
         input_context = self._make_input_context(llm)
         dataset = _load_local_hf_dataset(self.dataset_path, self.split)
         num_samples = self._get_num_samples(dataset)
-        samples = list(tqdm(self._iter_samples(dataset), desc="Loading samples", total=num_samples))
+        samples = list(
+            tqdm_with_time_prefix(
+                self._iter_samples(dataset), desc="Loading samples", total=num_samples
+            )
+        )
         inputs = [
             self._make_input(llm, sample, input_context)
-            for sample in tqdm(samples, desc="Loading inputs")
+            for sample in tqdm_with_time_prefix(samples, desc="Loading inputs")
         ]
         futures = []
         references = []
         scoring_samples = []
-        for sample, request_input in tqdm(
+        for sample, request_input in tqdm_with_time_prefix(
             zip(samples, inputs, strict=True), desc="Submitting requests", total=len(samples)
         ):
             params = (
@@ -197,7 +206,10 @@ def evaluate(
             )
             references.append(sample.transcript)
             scoring_samples.append(_sample_for_scoring(sample))
-        outputs = [future.result() for future in tqdm(futures, desc="Fetching responses")]
+        outputs = [
+            future.result(timeout=RESULT_WAIT_TIMEOUT_SECS)
+            for future in tqdm_with_time_prefix(futures, desc="Fetching responses")
+        ]
 
         profiler.stop("trtllm exec")
         elapsed_time = profiler.elapsed_time_in_sec("trtllm exec")

diff --git a/tensorrt_llm/evaluate/interface.py b/tensorrt_llm/evaluate/interface.py
@@ -29,6 +29,15 @@
 from ..logger import logger
 from ..sampling_params import SamplingParams
 
+# Per-request upper bound (seconds) on how long an evaluator waits for a single response before
+# failing fast. A stalled or dead executor worker would otherwise block `future.result()`
+# indefinitely, causing the whole evaluation to hang.
+# This is a backstop: it is intentionally larger than the executor's stall watchdog
+# (`TLLM_EXECUTOR_STALL_TIMEOUT_SECS`, default 300s) so the watchdog's more-informative
+# `RequestError` normally surfaces first; no healthy single request should come close to it.
+RESULT_WAIT_TIMEOUT_SECS = float(
+    os.environ.get("TLLM_EVAL_RESULT_TIMEOUT_SECS", "600"))
+
 
 def get_chat_template_kwargs(
         template_owner: Any,
@@ -145,7 +154,7 @@ def evaluate(self,
             auxiliaries.append(aux)
         results = []
         for output in tqdm(outputs, desc="Fetching responses"):
-            results.append(output.result())
+            results.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS))
 
         if self.output_dir:
             dump_inference_results(self.output_dir, results,

diff --git a/tensorrt_llm/evaluate/lm_eval.py b/tensorrt_llm/evaluate/lm_eval.py
@@ -21,7 +21,6 @@
 
 import click
 import numpy as np
-from tqdm import tqdm
 
 import tensorrt_llm.profiler as profiler
 from tensorrt_llm.inputs import prompt_inputs
@@ -44,8 +43,9 @@
 from ..llmapi import RequestOutput
 from ..logger import logger
 from ..sampling_params import SamplingParams
-from .interface import (Evaluator, dump_inference_results,
-                        get_chat_template_kwargs)
+from .interface import (RESULT_WAIT_TIMEOUT_SECS, Evaluator,
+                        dump_inference_results, get_chat_template_kwargs)
+from .progress import tqdm_with_time_prefix
 
 # NOTE: lm_eval uses "<image>" as the default image placeholder
 # https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/models/hf_vlms.py#L25
@@ -162,9 +162,9 @@ def _get_sampling_params(self, gen_kwargs: dict) -> SamplingParams:
     def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
         profiler.start("trtllm exec")
         results = []
-        for request in tqdm(requests,
-                            desc="Submitting requests",
-                            disable=disable_tqdm):
+        for request in tqdm_with_time_prefix(requests,
+                                             desc="Submitting requests",
+                                             disable=disable_tqdm):
             prompt, gen_kwargs = request.args
             sampling_params = self._get_sampling_params(gen_kwargs)
             output = self.llm.generate_async(prompt,
@@ -173,10 +173,10 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
             results.append(output)
 
         outputs = []
-        for output in tqdm(results,
-                           desc="Fetching responses",
-                           disable=disable_tqdm):
-            outputs.append(output.result())
+        for output in tqdm_with_time_prefix(results,
+                                            desc="Fetching responses",
+                                            disable=disable_tqdm):
+            outputs.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS))
 
         if self.output_dir:
             dump_inference_results(self.output_dir, outputs,
@@ -405,9 +405,9 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
         """
         profiler.start("trtllm exec")
         results = []
-        for request in tqdm(requests,
-                            desc="Submitting requests",
-                            disable=disable_tqdm):
+        for request in tqdm_with_time_prefix(requests,
+                                             desc="Submitting requests",
+                                             disable=disable_tqdm):
 
             # NOTE: For now, only this part is different from the original generate_until
             prompt, gen_kwargs, media_data = request.args
@@ -431,10 +431,10 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
             results.append(output)
 
         outputs = []
-        for output in tqdm(results,
-                           desc="Fetching responses",
-                           disable=disable_tqdm):
-            outputs.append(output.result())
+        for output in tqdm_with_time_prefix(results,
+                                            desc="Fetching responses",
+                                            disable=disable_tqdm):
+            outputs.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS))
 
         if self.output_dir:
             dump_inference_results(self.output_dir, outputs,

diff --git a/tensorrt_llm/evaluate/progress.py b/tensorrt_llm/evaluate/progress.py
@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from datetime import datetime
+from typing import Any
+
+from tqdm import tqdm
+
+_TIME_PREFIX_BAR_FORMAT = "{current_time} {l_bar}{bar}{r_bar}"
+
+
+class _TimePrefixTqdm(tqdm):
+    """tqdm subclass whose format fields also carry a ``current_time`` entry."""
+
+    @property
+    def format_dict(self) -> dict[str, Any]:
+        """Return the parent's format fields plus ``current_time`` (HH:MM:SS)."""
+        fields = super().format_dict
+        # Evaluated on each property access rather than cached.
+        fields["current_time"] = f"{datetime.now():%H:%M:%S}"
+        return fields
+
+
+def tqdm_with_time_prefix(*args: Any, **kwargs: Any) -> _TimePrefixTqdm:
+    """Create a tqdm bar that, by default, renders the current time before the description.
+
+    All arguments are forwarded to the tqdm constructor. A caller-supplied
+    ``bar_format`` is kept as is; otherwise the time-prefixed format is used.
+    """
+    if "bar_format" not in kwargs:
+        kwargs["bar_format"] = _TIME_PREFIX_BAR_FORMAT
+    return _TimePrefixTqdm(*args, **kwargs)
@@ -990,7 +990,23 @@ def _handle_ray_response(self, response: Any):
         return response
 
     def _result_step(self, timeout: Optional[float] = None):
-        response = self.queue.get()
+        """Fetch the next response from ``self.queue`` and handle it.
+
+        Args:
+            timeout: Passed unchanged to ``self.queue.get``. Presumably
+                ``None`` means "block until a response arrives" (verify).
+
+        Raises:
+            TimeoutError: If ``self.queue.get`` raises ``Empty``.
+        """
+        try:
+            response = self.queue.get(timeout=timeout)
+        except Empty:
+            # `Empty` adds nothing beyond the message below, so `from None`
+            # keeps it out of the traceback shown to the caller.
+            raise TimeoutError(
+                f"Request {self.request_id} timed out after {timeout}s "
+                f"waiting for a response from the executor worker.") from None
         self._handle_response(response)
 
     async def _aresult_step(self):

@@ -605,66 +605,67 @@ def test_auto_dtype(self, max_num_tokens):
             task.evaluate(llm, sampling_params=self.sampling_params)
 
 
+# The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
+# lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
+# uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
+# explicit image tiling/token accounting in the Mcore wrapper.
+# We also keep the generation budget small for CI speed, and this evaluator
+# does not strip reasoning traces after </think> before scoring. If the model
+# ignores the non-thinking directive, answer extraction may see the reasoning.
+EXTRA_EVALUATOR_KWARGS = dict(
+    apply_chat_template=True,
+    is_multimodal=True,
+)
+
+# NOTE: MMMU adds <|endoftext|> to the stop token.
+sampling_params = SamplingParams(
+    max_tokens=MMMU.MAX_OUTPUT_LEN,
+    truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
+    stop="<|endoftext|>",
+    temperature=0.0,
+    top_k=1,
+)
+MMMU_TASK_SPEC = (MMMU, sampling_params, EXTRA_EVALUATOR_KWARGS)
+
+voxpopuli_sampling_params = SamplingParams(
+    max_tokens=512,
+    truncate_prompt_tokens=VoxPopuli.MAX_INPUT_LEN,
+    temperature=0.0,
+    top_k=1,
+)
+no_thinking_evaluator_kwargs = {
+    # We explicitly disable thinking, because otherwise the thinking traces could
+    # be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor
+    # for reproducibility (the more tokens there are, the higher likelihood of the
+    # end output not being the same).
+    # In addition, if reasoning is cut off, then the WER goes through the roof,
+    # since each word in the output is treated as an error.
+    "chat_template_kwargs": {"enable_thinking": False},
+}
+VOXPOPULI_TASK_SPEC = (
+    VoxPopuli,
+    voxpopuli_sampling_params,
+    no_thinking_evaluator_kwargs,
+)
+
+videomme_sampling_params = SamplingParams(
+    max_tokens=VideoMME.MAX_OUTPUT_LEN,
+    truncate_prompt_tokens=VideoMME.MAX_INPUT_LEN,
+    temperature=0.0,
+    top_k=1,
+)
+VIDEOMME_TASK_SPEC = (
+    VideoMME,
+    videomme_sampling_params,
+    no_thinking_evaluator_kwargs,
+)
+
+
 # Skip for B300 / GB300:
 # * B300 coverage does not meaningfully extend what we test via B200.
 # * GB300 may not be entirely up to date for `llm-models`, leading to repo-wide CI errors.
 @skip_post_blackwell_ultra
 class TestNanoV3Omni(LlmapiAccuracyTestHarness):
-    # The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
-    # lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
-    # uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
-    # explicit image tiling/token accounting in the Mcore wrapper.
-    # We also keep the generation budget small for CI speed, and this evaluator
-    # does not strip reasoning traces after </think> before scoring. If the model
-    # ignores the non-thinking directive, answer extraction may see the reasoning.
-    EXTRA_EVALUATOR_KWARGS = dict(
-        apply_chat_template=True,
-        is_multimodal=True,
-    )
-
-    # NOTE: MMMU adds <|endoftext|> to the stop token.
-    sampling_params = SamplingParams(
-        max_tokens=MMMU.MAX_OUTPUT_LEN,
-        truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
-        stop="<|endoftext|>",
-        temperature=0.0,
-        top_k=1,
-    )
-    MMMU_TASK_SPEC = (MMMU, sampling_params, EXTRA_EVALUATOR_KWARGS)
-
-    voxpopuli_sampling_params = SamplingParams(
-        max_tokens=512,
-        truncate_prompt_tokens=VoxPopuli.MAX_INPUT_LEN,
-        temperature=0.0,
-        top_k=1,
-    )
-    no_thinking_evaluator_kwargs = {
-        # We explicitly disable thinking, because otherwise the thinking traces could
-        # be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor
-        # for reproducibility (the more tokens there are, the higher likelihood of the
-        # end output not being the same).
-        # In addition, if reasoning is cut off, then the WER goes through the roof,
-        # since each word in the output is treated as an error.
-        "chat_template_kwargs": {"enable_thinking": False},
-    }
-    VOXPOPULI_TASK_SPEC = (
-        VoxPopuli,
-        voxpopuli_sampling_params,
-        no_thinking_evaluator_kwargs,
-    )
-
-    videomme_sampling_params = SamplingParams(
-        max_tokens=VideoMME.MAX_OUTPUT_LEN,
-        truncate_prompt_tokens=VideoMME.MAX_INPUT_LEN,
-        temperature=0.0,
-        top_k=1,
-    )
-    VIDEOMME_TASK_SPEC = (
-        VideoMME,
-        videomme_sampling_params,
-        no_thinking_evaluator_kwargs,
-    )
-
     @pytest.mark.skip_less_device_memory(80000)
     @pytest.mark.parametrize(
         (
@@ -747,6 +748,26 @@ class TestNanoV3Omni(LlmapiAccuracyTestHarness):
                 marks=(skip_pre_blackwell,),
                 id="nvfp4",
             ),
+        ]
+        + [
+            # TEMPORARY: duplicate the flaky NVFP4 case for B200 CI iteration.
+            pytest.param(
+                "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
+                f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
+                KvCacheConfig(
+                    free_gpu_memory_fraction=0.8,
+                    mamba_ssm_cache_dtype="float32",
+                    enable_block_reuse=False,
+                    dtype="fp8",
+                ),
+                64,
+                QuantAlgo.MIXED_PRECISION,
+                (MMMU_TASK_SPEC, VOXPOPULI_TASK_SPEC, VIDEOMME_TASK_SPEC),
+                None,
+                marks=(skip_pre_blackwell,),
+                id=f"nvfp4_repeat_{i}",
+            )
+            for i in range(1, 11)
         ],
     )
     # `torch.compile` uses a thread pool to compile and it's used in audio pre-processing.