Skip to content

Commit 5b09793

Browse files
committed
[https://nvbugs/6336747][ci] Fix test timeouts
Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
1 parent bcb9441 commit 5b09793

6 files changed

Lines changed: 100 additions & 31 deletions

File tree

tensorrt_llm/evaluate/audio_asr.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
from typing import Any, Iterable, NamedTuple, Optional
2121

2222
import soundfile
23-
from tqdm import tqdm
2423

2524
import tensorrt_llm.profiler as profiler
2625
from tensorrt_llm.inputs import (
@@ -36,7 +35,13 @@
3635
from tensorrt_llm.logger import logger
3736
from tensorrt_llm.sampling_params import SamplingParams
3837

39-
from .interface import Evaluator, get_chat_template_kwargs, get_model_context
38+
from .interface import (
39+
RESULT_WAIT_TIMEOUT_SECS,
40+
Evaluator,
41+
get_chat_template_kwargs,
42+
get_model_context,
43+
)
44+
from .progress import tqdm_with_time_prefix
4045

4146

4247
class MultimodalASRSample(NamedTuple):
@@ -174,15 +179,19 @@ def evaluate(
174179
input_context = self._make_input_context(llm)
175180
dataset = _load_local_hf_dataset(self.dataset_path, self.split)
176181
num_samples = self._get_num_samples(dataset)
177-
samples = list(tqdm(self._iter_samples(dataset), desc="Loading samples", total=num_samples))
182+
samples = list(
183+
tqdm_with_time_prefix(
184+
self._iter_samples(dataset), desc="Loading samples", total=num_samples
185+
)
186+
)
178187
inputs = [
179188
self._make_input(llm, sample, input_context)
180-
for sample in tqdm(samples, desc="Loading inputs")
189+
for sample in tqdm_with_time_prefix(samples, desc="Loading inputs")
181190
]
182191
futures = []
183192
references = []
184193
scoring_samples = []
185-
for sample, request_input in tqdm(
194+
for sample, request_input in tqdm_with_time_prefix(
186195
zip(samples, inputs, strict=True), desc="Submitting requests", total=len(samples)
187196
):
188197
params = (
@@ -197,7 +206,10 @@ def evaluate(
197206
)
198207
references.append(sample.transcript)
199208
scoring_samples.append(_sample_for_scoring(sample))
200-
outputs = [future.result() for future in tqdm(futures, desc="Fetching responses")]
209+
outputs = [
210+
future.result(timeout=RESULT_WAIT_TIMEOUT_SECS)
211+
for future in tqdm_with_time_prefix(futures, desc="Fetching responses")
212+
]
201213

202214
profiler.stop("trtllm exec")
203215
elapsed_time = profiler.elapsed_time_in_sec("trtllm exec")

tensorrt_llm/evaluate/interface.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,15 @@
2929
from ..logger import logger
3030
from ..sampling_params import SamplingParams
3131

32+
# Per-request upper bound (seconds) on how long an evaluator waits for a single response before
33+
# failing fast. A stalled or dead executor worker would otherwise block `future.result()`
34+
# indefinitely, causing the whole evaluation to hang.
35+
# This is a backstop: it is intentionally larger than the executor's stall watchdog
36+
# (`TLLM_EXECUTOR_STALL_TIMEOUT_SECS`, default 300s) so the watchdog's more-informative
37+
# `RequestError` normally surfaces first; no healthy single request should come close to it.
38+
RESULT_WAIT_TIMEOUT_SECS = float(
39+
os.environ.get("TLLM_EVAL_RESULT_TIMEOUT_SECS", "600"))
40+
3241

3342
def get_chat_template_kwargs(
3443
template_owner: Any,
@@ -145,7 +154,7 @@ def evaluate(self,
145154
auxiliaries.append(aux)
146155
results = []
147156
for output in tqdm(outputs, desc="Fetching responses"):
148-
results.append(output.result())
157+
results.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS))
149158

150159
if self.output_dir:
151160
dump_inference_results(self.output_dir, results,

tensorrt_llm/evaluate/lm_eval.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import click
2323
import numpy as np
24-
from tqdm import tqdm
2524

2625
import tensorrt_llm.profiler as profiler
2726
from tensorrt_llm.inputs import prompt_inputs
@@ -44,8 +43,9 @@
4443
from ..llmapi import RequestOutput
4544
from ..logger import logger
4645
from ..sampling_params import SamplingParams
47-
from .interface import (Evaluator, dump_inference_results,
48-
get_chat_template_kwargs)
46+
from .interface import (RESULT_WAIT_TIMEOUT_SECS, Evaluator,
47+
dump_inference_results, get_chat_template_kwargs)
48+
from .progress import tqdm_with_time_prefix
4949

5050
# NOTE: lm_eval uses "<image>" as the default image placeholder
5151
# https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/models/hf_vlms.py#L25
@@ -162,9 +162,9 @@ def _get_sampling_params(self, gen_kwargs: dict) -> SamplingParams:
162162
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
163163
profiler.start("trtllm exec")
164164
results = []
165-
for request in tqdm(requests,
166-
desc="Submitting requests",
167-
disable=disable_tqdm):
165+
for request in tqdm_with_time_prefix(requests,
166+
desc="Submitting requests",
167+
disable=disable_tqdm):
168168
prompt, gen_kwargs = request.args
169169
sampling_params = self._get_sampling_params(gen_kwargs)
170170
output = self.llm.generate_async(prompt,
@@ -173,10 +173,10 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
173173
results.append(output)
174174

175175
outputs = []
176-
for output in tqdm(results,
177-
desc="Fetching responses",
178-
disable=disable_tqdm):
179-
outputs.append(output.result())
176+
for output in tqdm_with_time_prefix(results,
177+
desc="Fetching responses",
178+
disable=disable_tqdm):
179+
outputs.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS))
180180

181181
if self.output_dir:
182182
dump_inference_results(self.output_dir, outputs,
@@ -405,9 +405,9 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
405405
"""
406406
profiler.start("trtllm exec")
407407
results = []
408-
for request in tqdm(requests,
409-
desc="Submitting requests",
410-
disable=disable_tqdm):
408+
for request in tqdm_with_time_prefix(requests,
409+
desc="Submitting requests",
410+
disable=disable_tqdm):
411411

412412
# NOTE: For now, only this part is different from the original generate_until
413413
prompt, gen_kwargs, media_data = request.args
@@ -431,10 +431,10 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
431431
results.append(output)
432432

433433
outputs = []
434-
for output in tqdm(results,
435-
desc="Fetching responses",
436-
disable=disable_tqdm):
437-
outputs.append(output.result())
434+
for output in tqdm_with_time_prefix(results,
435+
desc="Fetching responses",
436+
disable=disable_tqdm):
437+
outputs.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS))
438438

439439
if self.output_dir:
440440
dump_inference_results(self.output_dir, outputs,

tensorrt_llm/evaluate/progress.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
from datetime import datetime
16+
from typing import Any
17+
18+
from tqdm import tqdm
19+
20+
# tqdm bar layout: a custom `{current_time}` field followed by the `{l_bar}{bar}{r_bar}` fields.
_TIME_PREFIX_BAR_FORMAT = "{current_time} {l_bar}{bar}{r_bar}"
21+
22+
23+
class _TimePrefixTqdm(tqdm):
    """tqdm subclass whose format fields include a ``current_time`` entry."""

    @property
    def format_dict(self) -> dict[str, Any]:
        """Return the parent's format fields plus ``current_time`` as ``HH:MM:SS``."""
        fields = super().format_dict
        # Sampled on every access so each rendered line shows the time it was drawn.
        timestamp = datetime.now().strftime("%H:%M:%S")
        fields["current_time"] = timestamp
        return fields
29+
30+
31+
def tqdm_with_time_prefix(*args: Any, **kwargs: Any) -> _TimePrefixTqdm:
    """Build a progress bar that renders the current time ahead of the description.

    All positional and keyword arguments are forwarded to the progress-bar class. A
    caller-supplied ``bar_format`` is kept as is; the time-prefixed layout is only
    filled in when that key is absent.
    """
    if "bar_format" not in kwargs:
        kwargs["bar_format"] = _TIME_PREFIX_BAR_FORMAT
    return _TimePrefixTqdm(*args, **kwargs)

tensorrt_llm/executor/result.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -990,7 +990,12 @@ def _handle_ray_response(self, response: Any):
990990
return response
991991

992992
def _result_step(self, timeout: Optional[float] = None):
    """Take the next response off ``self.queue`` and hand it to ``_handle_response``.

    Args:
        timeout: Forwarded to ``self.queue.get`` as the maximum wait in seconds.

    Raises:
        TimeoutError: If the queue raises ``Empty`` before a response arrives.
    """
    try:
        response = self.queue.get(timeout=timeout)
    except Empty:
        # The empty-queue exception is an implementation detail; suppress it so the
        # traceback shows only the actionable timeout error.
        raise TimeoutError(
            f"Request {self.request_id} timed out after {timeout}s "
            f"waiting for a response from the executor worker.") from None
    self._handle_response(response)
9951000

9961001
async def _aresult_step(self):

tests/integration/defs/accuracy/video_mme.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@
1818
from pathlib import Path
1919
from typing import Any, Iterable, NamedTuple, Optional
2020

21-
from tqdm import tqdm
22-
2321
import tensorrt_llm.profiler as profiler
2422
from tensorrt_llm.evaluate.interface import (
23+
RESULT_WAIT_TIMEOUT_SECS,
2524
Evaluator,
2625
dump_inference_results,
2726
get_chat_template_kwargs,
2827
get_model_context,
2928
)
29+
from tensorrt_llm.evaluate.progress import tqdm_with_time_prefix
3030
from tensorrt_llm.inputs import (
3131
ConversationMessage,
3232
MultimodalData,
@@ -132,15 +132,19 @@ def evaluate(
132132
) -> float:
133133
profiler.start("trtllm exec")
134134
input_context = self._make_input_context(llm)
135-
samples = list(tqdm(self._iter_samples(), desc="Loading samples", total=self.num_samples))
135+
samples = list(
136+
tqdm_with_time_prefix(
137+
self._iter_samples(), desc="Loading samples", total=self.num_samples
138+
)
139+
)
136140
video_cache: dict[str, Any] = {}
137141
inputs = [
138142
self._make_input(llm, sample, input_context, video_cache)
139-
for sample in tqdm(samples, desc="Loading inputs")
143+
for sample in tqdm_with_time_prefix(samples, desc="Loading inputs")
140144
]
141145

142146
futures = []
143-
for request_input in tqdm(inputs, desc="Submitting requests"):
147+
for request_input in tqdm_with_time_prefix(inputs, desc="Submitting requests"):
144148
params = (
145149
copy.deepcopy(sampling_params) if sampling_params is not None else SamplingParams()
146150
)
@@ -151,7 +155,12 @@ def evaluate(
151155
streaming=streaming,
152156
)
153157
)
154-
outputs = [future.result() for future in tqdm(futures, desc="Fetching responses")]
158+
# Bound the per-request wait so a stalled/dead worker fails the test fast instead of hanging
159+
# until the outer CI timeout. No healthy single request should come close to this budget.
160+
outputs = [
161+
future.result(timeout=RESULT_WAIT_TIMEOUT_SECS)
162+
for future in tqdm_with_time_prefix(futures, desc="Fetching responses")
163+
]
155164

156165
if self.output_dir:
157166
dump_inference_results(self.output_dir, outputs, getattr(llm, "tokenizer", None))

0 commit comments

Comments
 (0)