Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 1 addition & 9 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -4467,15 +4467,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-AutoDeploy-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-AutoDeploy-Post-Merge-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
"DGX_B200-PyTorch-1": ["auto:dgx-b200-flex", "l0_b200", 1, 9, 1, 1, true],
"DGX_B200-PyTorch-2": ["auto:dgx-b200-flex", "l0_b200", 2, 9, 1, 1, true],
"DGX_B200-PyTorch-3": ["auto:dgx-b200-flex", "l0_b200", 3, 9, 1, 1, true],
"DGX_B200-PyTorch-4": ["auto:dgx-b200-flex", "l0_b200", 4, 9, 1, 1, true],
"DGX_B200-PyTorch-5": ["auto:dgx-b200-flex", "l0_b200", 5, 9, 1, 1, true],
"DGX_B200-PyTorch-6": ["auto:dgx-b200-flex", "l0_b200", 6, 9, 1, 1, true],
"DGX_B200-PyTorch-7": ["auto:dgx-b200-flex", "l0_b200", 7, 9, 1, 1, true],
"DGX_B200-PyTorch-8": ["auto:dgx-b200-flex", "l0_b200", 8, 9, 1, 1, true],
"DGX_B200-PyTorch-9": ["auto:dgx-b200-flex", "l0_b200", 9, 9, 1, 1, true],
"DGX_B200-PyTorch-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
"DGX_B200-AutoDeploy-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
"DGX_B200-Triton-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
"DGX_B200-PyTorch-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 2, 1, 1, true],
Expand Down
5 changes: 5 additions & 0 deletions scripts/check_test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,6 +720,11 @@ def verify_waive_list(llm_src, args):
with open(tmp_waives_file, "w") as f:
f.writelines(f"{line}\n" for line in sorted(processed_lines))

if not processed_lines:
print("No integration waive entries found; skipping collection.",
flush=True)
return

subprocess.run(
f"cd {llm_src}/tests/integration/defs && "
f"pytest --test-list={tmp_waives_file} --output-dir={llm_src} -s --co -q",
Expand Down
24 changes: 18 additions & 6 deletions tensorrt_llm/evaluate/audio_asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from typing import Any, Iterable, NamedTuple, Optional

import soundfile
from tqdm import tqdm

import tensorrt_llm.profiler as profiler
from tensorrt_llm.inputs import (
Expand All @@ -36,7 +35,13 @@
from tensorrt_llm.logger import logger
from tensorrt_llm.sampling_params import SamplingParams

from .interface import Evaluator, get_chat_template_kwargs, get_model_context
from .interface import (
RESULT_WAIT_TIMEOUT_SECS,
Evaluator,
get_chat_template_kwargs,
get_model_context,
)
from .progress import tqdm_with_time_prefix


class MultimodalASRSample(NamedTuple):
Expand Down Expand Up @@ -174,15 +179,19 @@ def evaluate(
input_context = self._make_input_context(llm)
dataset = _load_local_hf_dataset(self.dataset_path, self.split)
num_samples = self._get_num_samples(dataset)
samples = list(tqdm(self._iter_samples(dataset), desc="Loading samples", total=num_samples))
samples = list(
tqdm_with_time_prefix(
self._iter_samples(dataset), desc="Loading samples", total=num_samples
)
)
inputs = [
self._make_input(llm, sample, input_context)
for sample in tqdm(samples, desc="Loading inputs")
for sample in tqdm_with_time_prefix(samples, desc="Loading inputs")
]
futures = []
references = []
scoring_samples = []
for sample, request_input in tqdm(
for sample, request_input in tqdm_with_time_prefix(
zip(samples, inputs, strict=True), desc="Submitting requests", total=len(samples)
):
params = (
Expand All @@ -197,7 +206,10 @@ def evaluate(
)
references.append(sample.transcript)
scoring_samples.append(_sample_for_scoring(sample))
outputs = [future.result() for future in tqdm(futures, desc="Fetching responses")]
outputs = [
future.result(timeout=RESULT_WAIT_TIMEOUT_SECS)
for future in tqdm_with_time_prefix(futures, desc="Fetching responses")
]

profiler.stop("trtllm exec")
elapsed_time = profiler.elapsed_time_in_sec("trtllm exec")
Expand Down
11 changes: 10 additions & 1 deletion tensorrt_llm/evaluate/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@
from ..logger import logger
from ..sampling_params import SamplingParams

# Upper bound, in seconds, on how long an evaluator waits for one response before it
# gives up. Without it, a stalled or dead executor worker would keep `future.result()`
# blocked forever and the whole evaluation would hang.
# The value is a backstop only: it is deliberately larger than the executor's stall
# watchdog (`TLLM_EXECUTOR_STALL_TIMEOUT_SECS`, default 300s), so the watchdog's more
# informative `RequestError` normally surfaces first. No healthy single request should
# get anywhere near this limit.
# Override it with the `TLLM_EVAL_RESULT_TIMEOUT_SECS` environment variable.
RESULT_WAIT_TIMEOUT_SECS = float(os.getenv("TLLM_EVAL_RESULT_TIMEOUT_SECS", "600"))


def get_chat_template_kwargs(
template_owner: Any,
Expand Down Expand Up @@ -145,7 +154,7 @@ def evaluate(self,
auxiliaries.append(aux)
results = []
for output in tqdm(outputs, desc="Fetching responses"):
results.append(output.result())
results.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS))

if self.output_dir:
dump_inference_results(self.output_dir, results,
Expand Down
34 changes: 17 additions & 17 deletions tensorrt_llm/evaluate/lm_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

import click
import numpy as np
from tqdm import tqdm

import tensorrt_llm.profiler as profiler
from tensorrt_llm.inputs import prompt_inputs
Expand All @@ -44,8 +43,9 @@
from ..llmapi import RequestOutput
from ..logger import logger
from ..sampling_params import SamplingParams
from .interface import (Evaluator, dump_inference_results,
get_chat_template_kwargs)
from .interface import (RESULT_WAIT_TIMEOUT_SECS, Evaluator,
dump_inference_results, get_chat_template_kwargs)
from .progress import tqdm_with_time_prefix

# NOTE: lm_eval uses "<image>" as the default image placeholder
# https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/models/hf_vlms.py#L25
Expand Down Expand Up @@ -162,9 +162,9 @@ def _get_sampling_params(self, gen_kwargs: dict) -> SamplingParams:
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
profiler.start("trtllm exec")
results = []
for request in tqdm(requests,
desc="Submitting requests",
disable=disable_tqdm):
for request in tqdm_with_time_prefix(requests,
desc="Submitting requests",
disable=disable_tqdm):
prompt, gen_kwargs = request.args
sampling_params = self._get_sampling_params(gen_kwargs)
output = self.llm.generate_async(prompt,
Expand All @@ -173,10 +173,10 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
results.append(output)

outputs = []
for output in tqdm(results,
desc="Fetching responses",
disable=disable_tqdm):
outputs.append(output.result())
for output in tqdm_with_time_prefix(results,
desc="Fetching responses",
disable=disable_tqdm):
outputs.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS))

if self.output_dir:
dump_inference_results(self.output_dir, outputs,
Expand Down Expand Up @@ -405,9 +405,9 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
"""
profiler.start("trtllm exec")
results = []
for request in tqdm(requests,
desc="Submitting requests",
disable=disable_tqdm):
for request in tqdm_with_time_prefix(requests,
desc="Submitting requests",
disable=disable_tqdm):

# NOTE: For now, only this part is different from the original generate_until
prompt, gen_kwargs, media_data = request.args
Expand All @@ -431,10 +431,10 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
results.append(output)

outputs = []
for output in tqdm(results,
desc="Fetching responses",
disable=disable_tqdm):
outputs.append(output.result())
for output in tqdm_with_time_prefix(results,
desc="Fetching responses",
disable=disable_tqdm):
outputs.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS))

if self.output_dir:
dump_inference_results(self.output_dir, outputs,
Expand Down
34 changes: 34 additions & 0 deletions tensorrt_llm/evaluate/progress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from datetime import datetime
from typing import Any

from tqdm import tqdm

# Bar layout: a custom `{current_time}` field, a space, then the `l_bar`, `bar` and
# `r_bar` fields. `current_time` is filled in by `_TimePrefixTqdm.format_dict`.
_TIME_PREFIX_BAR_FORMAT = "{current_time} {l_bar}{bar}{r_bar}"


class _TimePrefixTqdm(tqdm):
    """tqdm subclass whose format fields carry an extra ``current_time`` entry."""

    @property
    def format_dict(self) -> dict[str, Any]:
        """Return the parent's format fields plus ``current_time`` as ``HH:MM:SS``.

        The timestamp comes from ``datetime.now()`` and is recomputed on every
        access of this property.
        """
        fields = super().format_dict
        now = datetime.now()
        fields["current_time"] = now.strftime("%H:%M:%S")
        return fields


def tqdm_with_time_prefix(*args: Any, **kwargs: Any) -> _TimePrefixTqdm:
    """Build a tqdm progress bar that shows the current time ahead of the description.

    All positional and keyword arguments are forwarded to ``_TimePrefixTqdm``.
    A caller-supplied ``bar_format`` keyword is left untouched; the time-prefixed
    layout is filled in only when that keyword is absent.
    """
    if "bar_format" not in kwargs:
        kwargs["bar_format"] = _TIME_PREFIX_BAR_FORMAT
    return _TimePrefixTqdm(*args, **kwargs)
7 changes: 6 additions & 1 deletion tensorrt_llm/executor/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -990,7 +990,12 @@ def _handle_ray_response(self, response: Any):
return response

def _result_step(self, timeout: Optional[float] = None):
    """Wait for the next response on ``self.queue`` and pass it to ``_handle_response``.

    Args:
        timeout: Maximum number of seconds to wait for a response. The value is
            forwarded unchanged to ``self.queue.get``; with a standard-library
            queue, ``None`` means "block until an item arrives"
            (NOTE(review): confirm for other queue types used here).

    Raises:
        TimeoutError: If ``self.queue.get`` raises ``Empty``, i.e. no response
            arrived within ``timeout`` seconds.
    """
    # Take exactly one response, and only through the timed call. An additional
    # untimed ``get()`` ahead of it would block without bound and swallow a
    # response, defeating the timeout.
    try:
        response = self.queue.get(timeout=timeout)
    except Empty:
        # ``Empty`` carries no extra information, so suppress it from the
        # traceback and surface only the descriptive timeout error.
        raise TimeoutError(
            f"Request {self.request_id} timed out after {timeout}s "
            f"waiting for a response from the executor worker.") from None
    self._handle_response(response)

async def _aresult_step(self):
Expand Down
131 changes: 76 additions & 55 deletions tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,66 +605,67 @@ def test_auto_dtype(self, max_num_tokens):
task.evaluate(llm, sampling_params=self.sampling_params)


# The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
# lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
# uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
# explicit image tiling/token accounting in the Mcore wrapper.
# We also keep the generation budget small for CI speed, and this evaluator
# does not strip reasoning traces after </think> before scoring. If the model
# ignores the non-thinking directive, answer extraction may see the reasoning.
EXTRA_EVALUATOR_KWARGS = dict(
apply_chat_template=True,
is_multimodal=True,
)

# NOTE: MMMU adds <|endoftext|> to the stop token.
sampling_params = SamplingParams(
max_tokens=MMMU.MAX_OUTPUT_LEN,
truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
stop="<|endoftext|>",
temperature=0.0,
top_k=1,
)
MMMU_TASK_SPEC = (MMMU, sampling_params, EXTRA_EVALUATOR_KWARGS)

voxpopuli_sampling_params = SamplingParams(
max_tokens=512,
truncate_prompt_tokens=VoxPopuli.MAX_INPUT_LEN,
temperature=0.0,
top_k=1,
)
no_thinking_evaluator_kwargs = {
# We explicitly disable thinking, because otherwise the thinking traces could
# be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor
# for reproducibility (the more tokens there are, the higher likelihood of the
# end output not being the same).
# In addition, if reasoning is cut off, then the WER goes through the roof,
# since each word in the output is treated as an error.
"chat_template_kwargs": {"enable_thinking": False},
}
VOXPOPULI_TASK_SPEC = (
VoxPopuli,
voxpopuli_sampling_params,
no_thinking_evaluator_kwargs,
)

videomme_sampling_params = SamplingParams(
max_tokens=VideoMME.MAX_OUTPUT_LEN,
truncate_prompt_tokens=VideoMME.MAX_INPUT_LEN,
temperature=0.0,
top_k=1,
)
VIDEOMME_TASK_SPEC = (
VideoMME,
videomme_sampling_params,
no_thinking_evaluator_kwargs,
)


# Skip for B300 / GB300:
# * B300 coverage does not meaningfully extend what we test via B200.
# * GB300 may not be entirely up to date for `llm-models`, leading to repo-wide CI errors.
@skip_post_blackwell_ultra
class TestNanoV3Omni(LlmapiAccuracyTestHarness):
# The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
# lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
# uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
# explicit image tiling/token accounting in the Mcore wrapper.
# We also keep the generation budget small for CI speed, and this evaluator
# does not strip reasoning traces after </think> before scoring. If the model
# ignores the non-thinking directive, answer extraction may see the reasoning.
EXTRA_EVALUATOR_KWARGS = dict(
apply_chat_template=True,
is_multimodal=True,
)

# NOTE: MMMU adds <|endoftext|> to the stop token.
sampling_params = SamplingParams(
max_tokens=MMMU.MAX_OUTPUT_LEN,
truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
stop="<|endoftext|>",
temperature=0.0,
top_k=1,
)
MMMU_TASK_SPEC = (MMMU, sampling_params, EXTRA_EVALUATOR_KWARGS)

voxpopuli_sampling_params = SamplingParams(
max_tokens=512,
truncate_prompt_tokens=VoxPopuli.MAX_INPUT_LEN,
temperature=0.0,
top_k=1,
)
no_thinking_evaluator_kwargs = {
# We explicitly disable thinking, because otherwise the thinking traces could
# be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor
# for reproducibility (the more tokens there are, the higher likelihood of the
# end output not being the same).
# In addition, if reasoning is cut off, then the WER goes through the roof,
# since each word in the output is treated as an error.
"chat_template_kwargs": {"enable_thinking": False},
}
VOXPOPULI_TASK_SPEC = (
VoxPopuli,
voxpopuli_sampling_params,
no_thinking_evaluator_kwargs,
)

videomme_sampling_params = SamplingParams(
max_tokens=VideoMME.MAX_OUTPUT_LEN,
truncate_prompt_tokens=VideoMME.MAX_INPUT_LEN,
temperature=0.0,
top_k=1,
)
VIDEOMME_TASK_SPEC = (
VideoMME,
videomme_sampling_params,
no_thinking_evaluator_kwargs,
)

@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize(
(
Expand Down Expand Up @@ -747,6 +748,26 @@ class TestNanoV3Omni(LlmapiAccuracyTestHarness):
marks=(skip_pre_blackwell,),
id="nvfp4",
),
]
+ [
# TEMPORARY: duplicate the flaky NVFP4 case for B200 CI iteration.
pytest.param(
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
KvCacheConfig(
free_gpu_memory_fraction=0.8,
mamba_ssm_cache_dtype="float32",
enable_block_reuse=False,
dtype="fp8",
),
64,
QuantAlgo.MIXED_PRECISION,
(MMMU_TASK_SPEC, VOXPOPULI_TASK_SPEC, VIDEOMME_TASK_SPEC),
None,
marks=(skip_pre_blackwell,),
id=f"nvfp4_repeat_{i}",
)
for i in range(1, 11)
],
)
# `torch.compile` uses a thread pool to compile and it's used in audio pre-processing.
Expand Down
Loading
Loading