[https://nvbugs/6336747][ci] Fix test timeouts

2ez4bz · 2ez4bz · commit 5b09793369e8 · 2026-06-24T22:36:10.000-07:00
Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
diff --git a/tensorrt_llm/evaluate/audio_asr.py b/tensorrt_llm/evaluate/audio_asr.py
@@ -20,7 +20,6 @@
 from typing import Any, Iterable, NamedTuple, Optional
 
 import soundfile
-from tqdm import tqdm
 
 import tensorrt_llm.profiler as profiler
 from tensorrt_llm.inputs import (
@@ -36,7 +35,13 @@
 from tensorrt_llm.logger import logger
 from tensorrt_llm.sampling_params import SamplingParams
 
-from .interface import Evaluator, get_chat_template_kwargs, get_model_context
+from .interface import (
+    RESULT_WAIT_TIMEOUT_SECS,
+    Evaluator,
+    get_chat_template_kwargs,
+    get_model_context,
+)
+from .progress import tqdm_with_time_prefix
 
 
 class MultimodalASRSample(NamedTuple):
@@ -174,15 +179,19 @@ def evaluate(
         input_context = self._make_input_context(llm)
         dataset = _load_local_hf_dataset(self.dataset_path, self.split)
         num_samples = self._get_num_samples(dataset)
-        samples = list(tqdm(self._iter_samples(dataset), desc="Loading samples", total=num_samples))
+        samples = list(
+            tqdm_with_time_prefix(
+                self._iter_samples(dataset), desc="Loading samples", total=num_samples
+            )
+        )
         inputs = [
             self._make_input(llm, sample, input_context)
-            for sample in tqdm(samples, desc="Loading inputs")
+            for sample in tqdm_with_time_prefix(samples, desc="Loading inputs")
         ]
         futures = []
         references = []
         scoring_samples = []
-        for sample, request_input in tqdm(
+        for sample, request_input in tqdm_with_time_prefix(
             zip(samples, inputs, strict=True), desc="Submitting requests", total=len(samples)
         ):
             params = (
@@ -197,7 +206,10 @@ def evaluate(
             )
             references.append(sample.transcript)
             scoring_samples.append(_sample_for_scoring(sample))
-        outputs = [future.result() for future in tqdm(futures, desc="Fetching responses")]
+        outputs = [
+            future.result(timeout=RESULT_WAIT_TIMEOUT_SECS)
+            for future in tqdm_with_time_prefix(futures, desc="Fetching responses")
+        ]
 
         profiler.stop("trtllm exec")
         elapsed_time = profiler.elapsed_time_in_sec("trtllm exec")
diff --git a/tensorrt_llm/evaluate/interface.py b/tensorrt_llm/evaluate/interface.py
@@ -29,6 +29,15 @@
 from ..logger import logger
 from ..sampling_params import SamplingParams
 
+# Per-request upper bound (seconds) on how long an evaluator waits for a single response before
+# failing fast. A stalled or dead executor worker would otherwise block `future.result()`
+# indefinitely, turning one stuck request into a hang of the entire evaluation.
+# This is a backstop: it is intentionally larger than the executor's stall watchdog
+# (`TLLM_EXECUTOR_STALL_TIMEOUT_SECS`, default 300s) so the watchdog's more-informative
+# `RequestError` normally surfaces first; no healthy single request should come close to it.
+RESULT_WAIT_TIMEOUT_SECS = float(
+    os.environ.get("TLLM_EVAL_RESULT_TIMEOUT_SECS", "600"))
+
 
 def get_chat_template_kwargs(
         template_owner: Any,
@@ -145,7 +154,7 @@ def evaluate(self,
             auxiliaries.append(aux)
         results = []
         for output in tqdm(outputs, desc="Fetching responses"):
-            results.append(output.result())
+            results.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS))
 
         if self.output_dir:
             dump_inference_results(self.output_dir, results,
diff --git a/tensorrt_llm/evaluate/lm_eval.py b/tensorrt_llm/evaluate/lm_eval.py
@@ -21,7 +21,6 @@
 
 import click
 import numpy as np
-from tqdm import tqdm
 
 import tensorrt_llm.profiler as profiler
 from tensorrt_llm.inputs import prompt_inputs
@@ -44,8 +43,9 @@
 from ..llmapi import RequestOutput
 from ..logger import logger
 from ..sampling_params import SamplingParams
-from .interface import (Evaluator, dump_inference_results,
-                        get_chat_template_kwargs)
+from .interface import (RESULT_WAIT_TIMEOUT_SECS, Evaluator,
+                        dump_inference_results, get_chat_template_kwargs)
+from .progress import tqdm_with_time_prefix
 
 # NOTE: lm_eval uses "<image>" as the default image placeholder
 # https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/models/hf_vlms.py#L25
@@ -162,9 +162,9 @@ def _get_sampling_params(self, gen_kwargs: dict) -> SamplingParams:
     def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
         profiler.start("trtllm exec")
         results = []
-        for request in tqdm(requests,
-                            desc="Submitting requests",
-                            disable=disable_tqdm):
+        for request in tqdm_with_time_prefix(requests,
+                                             desc="Submitting requests",
+                                             disable=disable_tqdm):
             prompt, gen_kwargs = request.args
             sampling_params = self._get_sampling_params(gen_kwargs)
             output = self.llm.generate_async(prompt,
@@ -173,10 +173,10 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
             results.append(output)
 
         outputs = []
-        for output in tqdm(results,
-                           desc="Fetching responses",
-                           disable=disable_tqdm):
-            outputs.append(output.result())
+        for output in tqdm_with_time_prefix(results,
+                                            desc="Fetching responses",
+                                            disable=disable_tqdm):
+            outputs.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS))
 
         if self.output_dir:
             dump_inference_results(self.output_dir, outputs,
@@ -405,9 +405,9 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
         """
         profiler.start("trtllm exec")
         results = []
-        for request in tqdm(requests,
-                            desc="Submitting requests",
-                            disable=disable_tqdm):
+        for request in tqdm_with_time_prefix(requests,
+                                             desc="Submitting requests",
+                                             disable=disable_tqdm):
 
             # NOTE: For now, only this part is different from the original generate_until
             prompt, gen_kwargs, media_data = request.args
@@ -431,10 +431,10 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
             results.append(output)
 
         outputs = []
-        for output in tqdm(results,
-                           desc="Fetching responses",
-                           disable=disable_tqdm):
-            outputs.append(output.result())
+        for output in tqdm_with_time_prefix(results,
+                                            desc="Fetching responses",
+                                            disable=disable_tqdm):
+            outputs.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS))
 
         if self.output_dir:
             dump_inference_results(self.output_dir, outputs,
diff --git a/tensorrt_llm/evaluate/progress.py b/tensorrt_llm/evaluate/progress.py
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from datetime import datetime
+from typing import Any
+
+from tqdm import tqdm
+
+_TIME_PREFIX_BAR_FORMAT = "{current_time} {l_bar}{bar}{r_bar}"
+
+
+class _TimePrefixTqdm(tqdm):
+    @property
+    def format_dict(self) -> dict[str, Any]:
+        format_dict = super().format_dict
+        format_dict["current_time"] = datetime.now().strftime("%H:%M:%S")
+        return format_dict
+
+
+def tqdm_with_time_prefix(*args: Any, **kwargs: Any) -> _TimePrefixTqdm:
+    """Return a tqdm progress bar with the current time rendered before the description."""
+    kwargs.setdefault("bar_format", _TIME_PREFIX_BAR_FORMAT)
+    return _TimePrefixTqdm(*args, **kwargs)
diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py
@@ -990,7 +990,12 @@ def _handle_ray_response(self, response: Any):
         return response
 
     def _result_step(self, timeout: Optional[float] = None):
-        response = self.queue.get()
+        try:
+            response = self.queue.get(timeout=timeout)
+        except Empty:
+            raise TimeoutError(
+                f"Request {self.request_id} timed out after {timeout}s "
+                f"waiting for a response from the executor worker.")
         self._handle_response(response)
 
     async def _aresult_step(self):
diff --git a/tests/integration/defs/accuracy/video_mme.py b/tests/integration/defs/accuracy/video_mme.py
@@ -18,15 +18,15 @@
 from pathlib import Path
 from typing import Any, Iterable, NamedTuple, Optional
 
-from tqdm import tqdm
-
 import tensorrt_llm.profiler as profiler
 from tensorrt_llm.evaluate.interface import (
+    RESULT_WAIT_TIMEOUT_SECS,
     Evaluator,
     dump_inference_results,
     get_chat_template_kwargs,
     get_model_context,
 )
+from tensorrt_llm.evaluate.progress import tqdm_with_time_prefix
 from tensorrt_llm.inputs import (
     ConversationMessage,
     MultimodalData,
@@ -132,15 +132,19 @@ def evaluate(
     ) -> float:
         profiler.start("trtllm exec")
         input_context = self._make_input_context(llm)
-        samples = list(tqdm(self._iter_samples(), desc="Loading samples", total=self.num_samples))
+        samples = list(
+            tqdm_with_time_prefix(
+                self._iter_samples(), desc="Loading samples", total=self.num_samples
+            )
+        )
         video_cache: dict[str, Any] = {}
         inputs = [
             self._make_input(llm, sample, input_context, video_cache)
-            for sample in tqdm(samples, desc="Loading inputs")
+            for sample in tqdm_with_time_prefix(samples, desc="Loading inputs")
         ]
 
         futures = []
-        for request_input in tqdm(inputs, desc="Submitting requests"):
+        for request_input in tqdm_with_time_prefix(inputs, desc="Submitting requests"):
             params = (
                 copy.deepcopy(sampling_params) if sampling_params is not None else SamplingParams()
             )
@@ -151,7 +155,12 @@ def evaluate(
                     streaming=streaming,
                 )
             )
-        outputs = [future.result() for future in tqdm(futures, desc="Fetching responses")]
+        # Bound the per-request wait so a stalled/dead worker fails the test fast instead of hanging
+        # until the outer CI timeout. No healthy single request should come close to this budget.
+        outputs = [
+            future.result(timeout=RESULT_WAIT_TIMEOUT_SECS)
+            for future in tqdm_with_time_prefix(futures, desc="Fetching responses")
+        ]
 
         if self.output_dir:
             dump_inference_results(self.output_dir, outputs, getattr(llm, "tokenizer", None))