diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 7dbea59cade8..a35de1f25ce7 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -4467,15 +4467,7 @@ def launchTestJobs(pipeline, testFilter) "DGX_H100-4_GPUs-PyTorch-Ray-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], "DGX_H100-4_GPUs-AutoDeploy-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], "DGX_H100-4_GPUs-AutoDeploy-Post-Merge-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], - "DGX_B200-PyTorch-1": ["auto:dgx-b200-flex", "l0_b200", 1, 9, 1, 1, true], - "DGX_B200-PyTorch-2": ["auto:dgx-b200-flex", "l0_b200", 2, 9, 1, 1, true], - "DGX_B200-PyTorch-3": ["auto:dgx-b200-flex", "l0_b200", 3, 9, 1, 1, true], - "DGX_B200-PyTorch-4": ["auto:dgx-b200-flex", "l0_b200", 4, 9, 1, 1, true], - "DGX_B200-PyTorch-5": ["auto:dgx-b200-flex", "l0_b200", 5, 9, 1, 1, true], - "DGX_B200-PyTorch-6": ["auto:dgx-b200-flex", "l0_b200", 6, 9, 1, 1, true], - "DGX_B200-PyTorch-7": ["auto:dgx-b200-flex", "l0_b200", 7, 9, 1, 1, true], - "DGX_B200-PyTorch-8": ["auto:dgx-b200-flex", "l0_b200", 8, 9, 1, 1, true], - "DGX_B200-PyTorch-9": ["auto:dgx-b200-flex", "l0_b200", 9, 9, 1, 1, true], + "DGX_B200-PyTorch-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true], "DGX_B200-AutoDeploy-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true], "DGX_B200-Triton-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true], "DGX_B200-PyTorch-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 2, 1, 1, true], diff --git a/scripts/check_test_list.py b/scripts/check_test_list.py index 97bb4332c1d9..59787c46b981 100755 --- a/scripts/check_test_list.py +++ b/scripts/check_test_list.py @@ -720,6 +720,11 @@ def verify_waive_list(llm_src, args): with open(tmp_waives_file, "w") as f: f.writelines(f"{line}\n" for line in sorted(processed_lines)) + if not processed_lines: + print("No integration waive entries found; skipping collection.", + flush=True) + return + subprocess.run( f"cd {llm_src}/tests/integration/defs && " 
f"pytest --test-list={tmp_waives_file} --output-dir={llm_src} -s --co -q", diff --git a/tensorrt_llm/evaluate/audio_asr.py b/tensorrt_llm/evaluate/audio_asr.py index de866c3b3cef..af725f84b36a 100644 --- a/tensorrt_llm/evaluate/audio_asr.py +++ b/tensorrt_llm/evaluate/audio_asr.py @@ -20,7 +20,6 @@ from typing import Any, Iterable, NamedTuple, Optional import soundfile -from tqdm import tqdm import tensorrt_llm.profiler as profiler from tensorrt_llm.inputs import ( @@ -36,7 +35,13 @@ from tensorrt_llm.logger import logger from tensorrt_llm.sampling_params import SamplingParams -from .interface import Evaluator, get_chat_template_kwargs, get_model_context +from .interface import ( + RESULT_WAIT_TIMEOUT_SECS, + Evaluator, + get_chat_template_kwargs, + get_model_context, +) +from .progress import tqdm_with_time_prefix class MultimodalASRSample(NamedTuple): @@ -174,15 +179,19 @@ def evaluate( input_context = self._make_input_context(llm) dataset = _load_local_hf_dataset(self.dataset_path, self.split) num_samples = self._get_num_samples(dataset) - samples = list(tqdm(self._iter_samples(dataset), desc="Loading samples", total=num_samples)) + samples = list( + tqdm_with_time_prefix( + self._iter_samples(dataset), desc="Loading samples", total=num_samples + ) + ) inputs = [ self._make_input(llm, sample, input_context) - for sample in tqdm(samples, desc="Loading inputs") + for sample in tqdm_with_time_prefix(samples, desc="Loading inputs") ] futures = [] references = [] scoring_samples = [] - for sample, request_input in tqdm( + for sample, request_input in tqdm_with_time_prefix( zip(samples, inputs, strict=True), desc="Submitting requests", total=len(samples) ): params = ( @@ -197,7 +206,10 @@ def evaluate( ) references.append(sample.transcript) scoring_samples.append(_sample_for_scoring(sample)) - outputs = [future.result() for future in tqdm(futures, desc="Fetching responses")] + outputs = [ + future.result(timeout=RESULT_WAIT_TIMEOUT_SECS) + for future in 
tqdm_with_time_prefix(futures, desc="Fetching responses") + ] profiler.stop("trtllm exec") elapsed_time = profiler.elapsed_time_in_sec("trtllm exec") diff --git a/tensorrt_llm/evaluate/interface.py b/tensorrt_llm/evaluate/interface.py index c27ba88d6327..ca313c37a4ef 100644 --- a/tensorrt_llm/evaluate/interface.py +++ b/tensorrt_llm/evaluate/interface.py @@ -29,6 +29,15 @@ from ..logger import logger from ..sampling_params import SamplingParams +# Per-request upper bound (seconds) on how long an evaluator waits for a single response before +# failing fast. A stalled or dead executor worker would otherwise block `future.result()` +# indefinitely, turning an evaluation into potential hangs. +# This is a backstop: it is intentionally larger than the executor's stall watchdog +# (`TLLM_EXECUTOR_STALL_TIMEOUT_SECS`, default 300s) so the watchdog's more-informative +# `RequestError` normally surfaces first; no healthy single request should come close to it. +RESULT_WAIT_TIMEOUT_SECS = float( + os.environ.get("TLLM_EVAL_RESULT_TIMEOUT_SECS", "600")) + def get_chat_template_kwargs( template_owner: Any, @@ -145,7 +154,7 @@ def evaluate(self, auxiliaries.append(aux) results = [] for output in tqdm(outputs, desc="Fetching responses"): - results.append(output.result()) + results.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS)) if self.output_dir: dump_inference_results(self.output_dir, results, diff --git a/tensorrt_llm/evaluate/lm_eval.py b/tensorrt_llm/evaluate/lm_eval.py index d41ede2e4180..518b69a2059a 100644 --- a/tensorrt_llm/evaluate/lm_eval.py +++ b/tensorrt_llm/evaluate/lm_eval.py @@ -21,7 +21,6 @@ import click import numpy as np -from tqdm import tqdm import tensorrt_llm.profiler as profiler from tensorrt_llm.inputs import prompt_inputs @@ -44,8 +43,9 @@ from ..llmapi import RequestOutput from ..logger import logger from ..sampling_params import SamplingParams -from .interface import (Evaluator, dump_inference_results, - get_chat_template_kwargs) +from 
.interface import (RESULT_WAIT_TIMEOUT_SECS, Evaluator, + dump_inference_results, get_chat_template_kwargs) +from .progress import tqdm_with_time_prefix # NOTE: lm_eval uses "" as the default image placeholder # https://github.com/EleutherAI/lm-evaluation-harness/blob/7f04db12d2f8e7a99a0830d99eb78130e1ba2122/lm_eval/models/hf_vlms.py#L25 @@ -162,9 +162,9 @@ def _get_sampling_params(self, gen_kwargs: dict) -> SamplingParams: def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: profiler.start("trtllm exec") results = [] - for request in tqdm(requests, - desc="Submitting requests", - disable=disable_tqdm): + for request in tqdm_with_time_prefix(requests, + desc="Submitting requests", + disable=disable_tqdm): prompt, gen_kwargs = request.args sampling_params = self._get_sampling_params(gen_kwargs) output = self.llm.generate_async(prompt, @@ -173,10 +173,10 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: results.append(output) outputs = [] - for output in tqdm(results, - desc="Fetching responses", - disable=disable_tqdm): - outputs.append(output.result()) + for output in tqdm_with_time_prefix(results, + desc="Fetching responses", + disable=disable_tqdm): + outputs.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS)) if self.output_dir: dump_inference_results(self.output_dir, outputs, @@ -405,9 +405,9 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: """ profiler.start("trtllm exec") results = [] - for request in tqdm(requests, - desc="Submitting requests", - disable=disable_tqdm): + for request in tqdm_with_time_prefix(requests, + desc="Submitting requests", + disable=disable_tqdm): # NOTE: For now, only this part is different from the original generate_until prompt, gen_kwargs, media_data = request.args @@ -431,10 +431,10 @@ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: results.append(output) outputs = [] - for output in tqdm(results, - desc="Fetching 
responses", - disable=disable_tqdm): - outputs.append(output.result()) + for output in tqdm_with_time_prefix(results, + desc="Fetching responses", + disable=disable_tqdm): + outputs.append(output.result(timeout=RESULT_WAIT_TIMEOUT_SECS)) if self.output_dir: dump_inference_results(self.output_dir, outputs, diff --git a/tensorrt_llm/evaluate/progress.py b/tensorrt_llm/evaluate/progress.py new file mode 100644 index 000000000000..06132d8c05f2 --- /dev/null +++ b/tensorrt_llm/evaluate/progress.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from datetime import datetime +from typing import Any + +from tqdm import tqdm + +_TIME_PREFIX_BAR_FORMAT = "{current_time} {l_bar}{bar}{r_bar}" + + +class _TimePrefixTqdm(tqdm): + @property + def format_dict(self) -> dict[str, Any]: + format_dict = super().format_dict + format_dict["current_time"] = datetime.now().strftime("%H:%M:%S") + return format_dict + + +def tqdm_with_time_prefix(*args: Any, **kwargs: Any) -> _TimePrefixTqdm: + """Return a tqdm progress bar with the current time rendered before the description.""" + kwargs.setdefault("bar_format", _TIME_PREFIX_BAR_FORMAT) + return _TimePrefixTqdm(*args, **kwargs) diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index 4a16ee76587f..29db0e227e6e 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -990,7 +990,12 @@ def _handle_ray_response(self, response: Any): return response def _result_step(self, timeout: Optional[float] = None): - response = self.queue.get() + try: + response = self.queue.get(timeout=timeout) + except Empty: + raise TimeoutError( + f"Request {self.request_id} timed out after {timeout}s " + f"waiting for a response from the executor worker.") self._handle_response(response) async def _aresult_step(self): diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py index bdcfee64ce4f..eb0d67c1326e 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py @@ -605,66 +605,67 @@ def test_auto_dtype(self, max_num_tokens): task.evaluate(llm, sampling_params=self.sampling_params) +# The score here may be lower than VLMEvalKitMcore (official) runs. 
This path uses +# lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore +# uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and +# explicit image tiling/token accounting in the Mcore wrapper. +# We also keep the generation budget small for CI speed, and this evaluator +# does not strip reasoning traces before scoring. If the model +# ignores the non-thinking directive, answer extraction may see the reasoning. +EXTRA_EVALUATOR_KWARGS = dict( + apply_chat_template=True, + is_multimodal=True, +) + +# NOTE: MMMU adds <|endoftext|> to the stop token. +sampling_params = SamplingParams( + max_tokens=MMMU.MAX_OUTPUT_LEN, + truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, + stop="<|endoftext|>", + temperature=0.0, + top_k=1, +) +MMMU_TASK_SPEC = (MMMU, sampling_params, EXTRA_EVALUATOR_KWARGS) + +voxpopuli_sampling_params = SamplingParams( + max_tokens=512, + truncate_prompt_tokens=VoxPopuli.MAX_INPUT_LEN, + temperature=0.0, + top_k=1, +) +no_thinking_evaluator_kwargs = { + # We explicitly disable thinking, because otherwise the thinking traces could + # be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor + # for reproducibility (the more tokens there are, the higher likelihood of the + # end output not being the same). + # In addition, if reasoning is cut off, then the WER goes through the roof, + # since each word in the output is treated as an error. + "chat_template_kwargs": {"enable_thinking": False}, +} +VOXPOPULI_TASK_SPEC = ( + VoxPopuli, + voxpopuli_sampling_params, + no_thinking_evaluator_kwargs, +) + +videomme_sampling_params = SamplingParams( + max_tokens=VideoMME.MAX_OUTPUT_LEN, + truncate_prompt_tokens=VideoMME.MAX_INPUT_LEN, + temperature=0.0, + top_k=1, +) +VIDEOMME_TASK_SPEC = ( + VideoMME, + videomme_sampling_params, + no_thinking_evaluator_kwargs, +) + + # Skip for B300 / GB300: # * B300 coverage does not meaningfully extend what we test via B200. 
# * GB300 may not be entirely up to date for `llm-models`, leading to repo-wide CI errors. @skip_post_blackwell_ultra class TestNanoV3Omni(LlmapiAccuracyTestHarness): - # The score here may be lower than VLMEvalKitMcore (official) runs. This path uses - # lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore - # uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and - # explicit image tiling/token accounting in the Mcore wrapper. - # We also keep the generation budget small for CI speed, and this evaluator - # does not strip reasoning traces after before scoring. If the model - # ignores the non-thinking directive, answer extraction may see the reasoning. - EXTRA_EVALUATOR_KWARGS = dict( - apply_chat_template=True, - is_multimodal=True, - ) - - # NOTE: MMMU adds <|endoftext|> to the stop token. - sampling_params = SamplingParams( - max_tokens=MMMU.MAX_OUTPUT_LEN, - truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, - stop="<|endoftext|>", - temperature=0.0, - top_k=1, - ) - MMMU_TASK_SPEC = (MMMU, sampling_params, EXTRA_EVALUATOR_KWARGS) - - voxpopuli_sampling_params = SamplingParams( - max_tokens=512, - truncate_prompt_tokens=VoxPopuli.MAX_INPUT_LEN, - temperature=0.0, - top_k=1, - ) - no_thinking_evaluator_kwargs = { - # We explicitly disable thinking, because otherwise the thinking traces could - # be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor - # for reproducibility (the more tokens there are, the higher likelihood of the - # end output not being the same). - # In addition, if reasoning is cut off, then the WER goes through the roof, - # since each word in the output is treated as an error. 
- "chat_template_kwargs": {"enable_thinking": False}, - } - VOXPOPULI_TASK_SPEC = ( - VoxPopuli, - voxpopuli_sampling_params, - no_thinking_evaluator_kwargs, - ) - - videomme_sampling_params = SamplingParams( - max_tokens=VideoMME.MAX_OUTPUT_LEN, - truncate_prompt_tokens=VideoMME.MAX_INPUT_LEN, - temperature=0.0, - top_k=1, - ) - VIDEOMME_TASK_SPEC = ( - VideoMME, - videomme_sampling_params, - no_thinking_evaluator_kwargs, - ) - @pytest.mark.skip_less_device_memory(80000) @pytest.mark.parametrize( ( @@ -747,6 +748,26 @@ class TestNanoV3Omni(LlmapiAccuracyTestHarness): marks=(skip_pre_blackwell,), id="nvfp4", ), + ] + + [ + # TEMPORARY: duplicate the flaky NVFP4 case for B200 CI iteration. + pytest.param( + "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + KvCacheConfig( + free_gpu_memory_fraction=0.8, + mamba_ssm_cache_dtype="float32", + enable_block_reuse=False, + dtype="fp8", + ), + 64, + QuantAlgo.MIXED_PRECISION, + (MMMU_TASK_SPEC, VOXPOPULI_TASK_SPEC, VIDEOMME_TASK_SPEC), + None, + marks=(skip_pre_blackwell,), + id=f"nvfp4_repeat_{i}", + ) + for i in range(1, 11) ], ) # `torch.compile` uses a thread pool to compile and it's used in audio pre-processing. 
diff --git a/tests/integration/defs/accuracy/video_mme.py b/tests/integration/defs/accuracy/video_mme.py index 86cd625b0fb9..326ae93d21c4 100644 --- a/tests/integration/defs/accuracy/video_mme.py +++ b/tests/integration/defs/accuracy/video_mme.py @@ -18,15 +18,15 @@ from pathlib import Path from typing import Any, Iterable, NamedTuple, Optional -from tqdm import tqdm - import tensorrt_llm.profiler as profiler from tensorrt_llm.evaluate.interface import ( + RESULT_WAIT_TIMEOUT_SECS, Evaluator, dump_inference_results, get_chat_template_kwargs, get_model_context, ) +from tensorrt_llm.evaluate.progress import tqdm_with_time_prefix from tensorrt_llm.inputs import ( ConversationMessage, MultimodalData, @@ -132,15 +132,19 @@ def evaluate( ) -> float: profiler.start("trtllm exec") input_context = self._make_input_context(llm) - samples = list(tqdm(self._iter_samples(), desc="Loading samples", total=self.num_samples)) + samples = list( + tqdm_with_time_prefix( + self._iter_samples(), desc="Loading samples", total=self.num_samples + ) + ) video_cache: dict[str, Any] = {} inputs = [ self._make_input(llm, sample, input_context, video_cache) - for sample in tqdm(samples, desc="Loading inputs") + for sample in tqdm_with_time_prefix(samples, desc="Loading inputs") ] futures = [] - for request_input in tqdm(inputs, desc="Submitting requests"): + for request_input in tqdm_with_time_prefix(inputs, desc="Submitting requests"): params = ( copy.deepcopy(sampling_params) if sampling_params is not None else SamplingParams() ) @@ -151,7 +155,12 @@ def evaluate( streaming=streaming, ) ) - outputs = [future.result() for future in tqdm(futures, desc="Fetching responses")] + # Bound the per-request wait so a stalled/dead worker fails the test fast instead of hanging + # until the outer CI timeout. No healthy single request should come close to this budget. 
+ outputs = [ + future.result(timeout=RESULT_WAIT_TIMEOUT_SECS) + for future in tqdm_with_time_prefix(futures, desc="Fetching responses") + ] if self.output_dir: dump_inference_results(self.output_dir, outputs, getattr(llm, "tokenizer", None)) diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index 5490ff14ade3..6e9a638df986 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -15,354 +15,17 @@ l0_b200: backend: pytorch tests: # ------------- PyTorch tests --------------- - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4_streaming[stream_interval_4] - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4_streaming[stream_interval_64] - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[v2_kv_cache=False-attn_backend=TRTLLM-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[v2_kv_cache=False-attn_backend=TRTLLM-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[v2_kv_cache=True-attn_backend=TRTLLM-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_nvfp4 - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=False] - - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True-v2_kv_cache=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True-v2_kv_cache=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_flashinfer[enable_chunked_prefill=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_flashinfer[enable_chunked_prefill=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTEDSL-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=True-fp8kv=False-overlap_scheduler=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=False-fp8kv=False-overlap_scheduler=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=nvfp4-kv_cache_reuse=True-fp8kv=False-overlap_scheduler=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=nvfp4-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_dummy_load_format - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_batch_waiting[batch_wait_timeout_iters=10-batch_wait_max_tokens_ratio=1.0-mtp_nextn=0-fp8kv=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-v2_kv_cache=True] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-auto] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-fp8] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-fp8] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_dummy_load_format - - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] # Cover nvbugs 5461712 and 5505402 - - 
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] - - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRITON] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9-fp8kv=True] - - accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=True] - - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4] - - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[fp8_mmmu_encoder_cuda_graph] - - accuracy/test_epd_disagg_multimodal.py::TestVideoMMEEPD::test_disaggregated_videomme[qwen3vl_2b_instruct] - - accuracy/test_epd_disagg_multimodal.py::TestVideoMMEEPD::test_disaggregated_videomme[nemotron_nano_v3_omni_nvfp4] - - accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16_mtp - - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551 - - 
llmapi/test_llm_api_pytorch_bart.py::test_bart_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-greedy-bart-large-cnn] - - llmapi/test_llm_api_pytorch_bart.py::test_bart_pytorch_generate_encoder_decoder_end_to_end[fp16-kv-v1-cuda-graph-off-greedy-bart-large-cnn] - - llmapi/test_llm_api_pytorch_bart.py::test_bart_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-bart-large-cnn] - - llmapi/test_llm_api_pytorch_bart.py::test_bart_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v2-cuda-graph-off-greedy-bart-large-cnn] - - llmapi/test_llm_api_pytorch_bart.py::test_bart_pytorch_generate_encoder_decoder_mixed_encoder_lengths_batch[bf16-kv-v1-cuda-graph-off-greedy-batch2-bart-large-cnn] - - llmapi/test_llm_api_pytorch_bart.py::test_bart_pytorch_generate_encoder_decoder_mixed_encoder_lengths_batch[bf16-kv-v2-cuda-graph-off-greedy-batch2-bart-large-cnn] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-t5-small0] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-flan-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-t5-base] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-t5-large] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-flan-t5-base] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-flan-t5-large] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-flan-t5-xl] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-flan-t5-xxl] - - 
llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-t5-small1] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-greedy-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp16-kv-v1-cuda-graph-off-beam2-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp32-kv-v1-cuda-graph-off-beam2-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp16-kv-v1-cuda-graph-off-beam2-flan-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp32-kv-v1-cuda-graph-off-beam2-flan-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v2-cuda-graph-off-greedy-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp16-kv-v2-cuda-graph-off-greedy-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp32-kv-v2-cuda-graph-off-greedy-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v2-cuda-graph-off-greedy-flan-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp16-kv-v2-cuda-graph-off-greedy-flan-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp32-kv-v2-cuda-graph-off-greedy-flan-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v2-cuda-graph-off-greedy-byt5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_mixed_encoder_lengths_batch[bf16-kv-v1-cuda-graph-off-beam2-batch2-t5-small] - - 
llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_mixed_encoder_lengths_batch[bf16-kv-v1-cuda-graph-off-beam2-batch2-flan-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_mixed_encoder_lengths_batch[bf16-kv-v1-cuda-graph-off-greedy-batch2-t5-small] - - llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_mixed_encoder_lengths_batch[bf16-kv-v2-cuda-graph-off-greedy-batch2-t5-small] - - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B] - - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8] - - test_e2e.py::test_ptp_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16] - - test_e2e.py::test_ptp_quickstart_advanced_mtp_eagle[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16] - - test_e2e.py::test_ptp_quickstart_advanced_mixed_precision - - test_e2e.py::test_ptp_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] - - test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] - - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False] - - test_e2e.py::test_openai_chat_guided_decoding[openai/gpt-oss-120b] - - unittest/_torch/attention - - unittest/_torch/compilation - - unittest/_torch/debugger - # ------------- modules (non-MoE) --------------- - - unittest/_torch/modules/test_mla_helix.py - - unittest/_torch/modules/test_fused_add_rms_norm_quant.py - - unittest/_torch/modules/test_fused_activation_quant.py - - unittest/_torch/modules/test_awq_quantization.py - - unittest/_torch/modules/test_triton_linear.py - - unittest/_torch/modules/test_group_rmn_norm.py - - unittest/_torch/modules/test_rotary_embedding.py - - unittest/_torch/modules/mamba - - unittest/_torch/modules/tests_lora_modules - # ------------- MoE components 
tests --------------- - - unittest/_torch/modules/test_moe_load_balancer.py - - unittest/_torch/modules/test_moe_routing.py - - unittest/_torch/modules/test_moe_host_sharer.py - - unittest/_torch/modules/fused_moe/test_deepgemm_fused_gather_finalize.py - - unittest/_torch/modules/fused_moe/test_deepgemm_fused_expand_quant.py - # ------------- legacy MoE tests --------------- - - unittest/_torch/modules/test_fused_moe.py - # ------------- MoE: test_moe_backend (by backend) --------------- - - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "CUTLASS" - - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "TRTLLM" - - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "CUTEDSL and not MEGAMOE_CUTEDSL" - - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "DEEPGEMM and not MEGAMOE_DEEPGEMM" - - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "DENSEGEMM" - - unittest/_torch/modules/moe/test_moe_backend.py::test_trtllm_bf16_unquantized_moe - - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "MEGAMOE_CUTEDSL" - - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "MEGAMOE_DEEPGEMM" - # ------------- MoE: test_single_gpu (by backend) --------------- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "CUTLASS" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "TRTLLM" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "CUTEDSL and not MEGAMOE_CUTEDSL" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "DEEPGEMM and not MEGAMOE_DEEPGEMM" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "DENSEGEMM" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "MEGAMOE_CUTEDSL" - - 
unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "MEGAMOE_DEEPGEMM" - # ------------- MoE: FlashInfer & TRTLLM symbol collision tests --------------- - - unittest/_torch/flashinfer/test_trtllm_flashinfer_symbol_collision.py - # --- MoE end - # B-tier only runs the FP8 MoE parametrize (DEEPGEMM path is SM100-only via - # _get_moe_config_for_blackwell); dense bf16 variants are covered on Hopper. - # The other 5 multimodal files are HW-agnostic Python plumbing and run on Hopper only. - - unittest/_torch/multimodal/test_mm_encoder_standalone.py -k "qwen3_30b_a3b_fp8" - - unittest/_torch/sampler - - unittest/_torch/speculative/test_eagle3.py - - unittest/_torch/thop/parallel TIMEOUT (90) - - unittest/_torch/thop/serial - - unittest/_torch/modeling -k "modeling_llama" - - unittest/_torch/modeling -k "modeling_mixtral" - - unittest/_torch/modeling -k "modeling_gpt_oss" - - unittest/_torch/modeling/test_modeling_afmoe.py - - unittest/_torch/modeling/test_modeling_exaone_moe.py - - unittest/_torch/modeling/test_modeling_gemma4.py - - unittest/_torch/modeling/test_gemma4_multimodal.py - - unittest/_torch/modeling/test_gemma4_e2e_dummy.py::test_e2e_text_26b_dummy - - unittest/_torch/modeling/test_gemma4_e2e_dummy.py::test_e2e_text_e2b_dummy - - unittest/_torch/modeling/test_gemma4_e2e_dummy.py::test_e2e_text_31b_dummy - - unittest/_torch/modeling/test_gemma4_e2e_dummy.py::test_e2e_text_e4b_dummy - - unittest/_torch/modeling/test_gemma4_e2e_dummy.py::test_e2e_multimodal_26b_dummy - - unittest/tools/test_layer_wise_benchmarks.py::test_deepseek_r1_ctx_dep[1] - - unittest/tools/test_layer_wise_benchmarks.py::test_nemotron_gen_dep[1] - - unittest/tools/test_layer_wise_benchmarks.py::test_qwen3_next_gen_tep[1] - - unittest/tools/test_layer_wise_benchmarks.py::test_performance_alignment[1] - - unittest/kv_cache_manager_v2_tests/ - # ------------- KV Cache V2 Scheduler IT --------------- - - 
kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_v2_vs_v1_basic - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_token_budget_limited - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_chunked_prefill - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_chunked_prefill_multi_request - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_eviction[cuda_graph] - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_eviction[no_cuda_graph] - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_batch_size_limited - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_overlap_scheduler[non_overlap] - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_overlap_scheduler[overlap] - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_block_reuse - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_partial_block_reuse - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_chunked_prefill_with_eviction - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_eviction_with_block_reuse - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_chunked_prefill_eviction_block_reuse - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2Llama::test_eviction_overlap - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2LoRA::test_lora_v2 - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2LoRA::test_lora_multi_adapter_v2 - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2LoRA::test_lora_chunked_prefill - - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2LoRA::test_lora_eviction - # ------------- KV Cache Iteration Stats --------------- - - unittest/executor/test_stats_serializer.py - - unittest/metrics/test_collector.py - - kv_cache/test_kv_cache_iteration_stats.py::TestKvCacheIterationStats::test_cold_start - - 
kv_cache/test_kv_cache_iteration_stats.py::TestKvCacheIterationStats::test_partial_block_reuse - - kv_cache/test_kv_cache_iteration_stats.py::TestKvCacheIterationStats::test_full_block_reuse - - kv_cache/test_kv_cache_iteration_stats.py::TestKvCacheIterationStats::test_shared_prefix - - kv_cache/test_kv_cache_iteration_stats.py::TestKvCacheIterationStats::test_batch_generation - - kv_cache/test_kv_cache_iteration_stats.py::TestKvCacheIterationStats::test_long_context - - kv_cache/test_kv_cache_iteration_stats.py::TestKvCacheIterationStats::test_rapid_fire - - kv_cache/test_kv_cache_iteration_stats.py::TestKvCacheIterationStats::test_field_completeness - # ------------- Prefix-aware scheduling E2E tests --------------- - - kv_cache/test_prefix_aware_scheduling.py::TestServePrefixAwareScheduling::test_multi_round_qa_shared_prefix_smoke - # ------------- Visual Gen tests --------------- - - unittest/_torch/visual_gen/test_visual_gen_args.py - - unittest/_torch/visual_gen/test_visual_gen_params.py - - unittest/_torch/visual_gen/test_visual_gen_utils.py - - unittest/_torch/visual_gen/test_warmup.py - - unittest/_torch/visual_gen/test_teacache.py - - unittest/_torch/visual_gen/test_cache_dit.py - - unittest/_torch/visual_gen/test_quant_ops.py - - unittest/_torch/visual_gen/test_attention_cute_dsl.py - - unittest/_torch/visual_gen/test_attention_trtllm_sage.py - - unittest/_torch/visual_gen/test_attention_integration.py - - unittest/_torch/visual_gen/test_attention_perf.py - - unittest/_torch/visual_gen/test_trtllm_serve_endpoints.py - - unittest/_torch/visual_gen/test_trtllm_serve_e2e.py - - unittest/_torch/visual_gen/test_model_loader.py - - unittest/_torch/visual_gen/test_flux_transformer.py - - unittest/_torch/visual_gen/test_flux_attention.py - - unittest/_torch/visual_gen/test_flux_pipeline.py - - unittest/_torch/visual_gen/test_ltx2_transformer.py - - unittest/_torch/visual_gen/test_ltx2_attention.py - - unittest/_torch/visual_gen/test_ltx2_pipeline.py - - 
unittest/_torch/visual_gen/test_wan21_i2v_pipeline.py - - unittest/_torch/visual_gen/test_wan21_t2v_pipeline.py - - unittest/_torch/visual_gen/test_wan22_i2v_pipeline.py - - unittest/_torch/visual_gen/test_wan22_t2v_pipeline.py - - unittest/_torch/visual_gen/test_wan22_ti2v_5b_pipeline.py - - unittest/_torch/visual_gen/test_wan21_i2v_teacache.py - - unittest/_torch/visual_gen/test_wan21_t2v_teacache.py - - unittest/_torch/visual_gen/test_wan21_t2v_teacache_user_coefficients.py - - unittest/_torch/visual_gen/test_wan22_i2v_teacache.py - - unittest/_torch/visual_gen/test_wan22_t2v_teacache.py - - unittest/_torch/visual_gen/test_wan_transformer.py - - unittest/_torch/visual_gen/test_cosmos3_transformer.py - - unittest/_torch/visual_gen/test_cosmos3_pipeline.py - - examples/visual_gen/test_visual_gen.py::test_wan_t2v_example - - examples/visual_gen/test_visual_gen.py::test_flux1_example - - examples/visual_gen/test_visual_gen.py::test_flux2_example - - examples/visual_gen/test_visual_gen.py::test_ltx2_example - - examples/visual_gen/test_visual_gen.py::test_wan_i2v_example - - examples/visual_gen/test_visual_gen.py::test_cosmos3_example - - examples/visual_gen/test_visual_gen.py::test_qwen_image_example - # - examples/visual_gen/test_visual_gen.py - # ------------- Host perf module regression tests (6 representative scenarios) --------------- - - perf/host_perf/test_module_scheduler.py::test_scheduler_production[production_gen_only_bs8] - - perf/host_perf/test_module_scheduler.py::test_scheduler_production[production_mixed_32gen_4ctx] - - perf/host_perf/test_module_sampler.py::test_sampler_update_greedy[greedy_bs8] - - perf/host_perf/test_module_sampler.py::test_sampler_update_stop_words[stopwords_bs32] - - perf/host_perf/test_module_resource_manager.py::test_kv_cache_prepare_generation - - perf/host_perf/test_module_resource_manager.py::test_kv_cache_prepare_context - # ------------- Host perf E2E regression tests (reuse perf_sanity with host-overhead-dominant 
configs) --------------- - - perf/test_perf_sanity.py::test_e2e[aggr_upload-host_perf_llama8b-llama8b_fp16_bs8_128_256] - - perf/test_perf_sanity.py::test_e2e[aggr_upload-host_perf_deepseek_v3_lite-v3lite_fp8_bs8_128_256] - - perf/test_perf_sanity.py::test_e2e[aggr_upload-host_perf_llama8b_spec_decode-llama8b_spec_bs1_128_128] -- condition: - ranges: - system_gpu_count: - gte: 1 - lte: 1 - wildcards: - gpu: - - '*b100*' - - '*b200*' - linux_distribution_name: ubuntu* - terms: - stage: post_merge - backend: tensorrt - tests: - # ------------- TRT tests --------------- - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4 - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[disable_norm_quant_fusion-disable_fused_quant] - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[disable_norm_quant_fusion-enable_fused_quant] - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[enable_norm_quant_fusion-disable_fused_quant] - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[enable_norm_quant_fusion-enable_fused_quant] - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_auto_dtype - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_fp8 - - unittest/trt/attention/test_gpt_attention.py -k "trtllm_gen" - - unittest/llmapi/test_llm_quant.py # 3.5 mins on B200 - - unittest/trt/functional/test_fp4_gemm.py # 3 mins on B200 -- condition: - ranges: - system_gpu_count: - gte: 1 - lte: 1 - wildcards: - gpu: - - '*b100*' - - '*b200*' - linux_distribution_name: ubuntu* - terms: - stage: post_merge - backend: triton - tests: - - triton_server/test_triton.py::test_llava[llava] - - triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] - - triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora] -- condition: - ranges: - system_gpu_count: - gte: 1 - lte: 1 - wildcards: - gpu: - - '*b100*' - - '*b200*' - linux_distribution_name: ubuntu* - terms: - stage: 
post_merge - backend: pytorch - tests: - # ------------- PyTorch tests --------------- - # Covered by H100 pre_merge for primary HW-agnostic signal; keep B200 runtime - # canary in post_merge for CUDA IPC / virtual memory / profiling paths. - - unittest/_torch/misc - - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1_block_reuse-cutlass] - - accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTEDSL-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_python_scheduler[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-enable_chunked_prefill=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_python_scheduler[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-enable_chunked_prefill=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_python_scheduler[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-enable_chunked_prefill=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_python_scheduler[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-enable_chunked_prefill=False] - - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_python_scheduler[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-enable_chunked_prefill=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_python_scheduler[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-enable_chunked_prefill=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_python_scheduler[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-enable_chunked_prefill=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_python_scheduler[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-enable_chunked_prefill=True] - - accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16[tp1-CUTLASS] - - accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16[tp1-TRTLLM] - - accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B_Instruct_RocketKV::test_auto_dtype - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-auto] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-auto] - - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[bf16] - # ------------- VisualGen single-GPU tests --------------- - - examples/visual_gen/test_visual_gen.py::test_visual_gen_quickstart - - examples/visual_gen/test_visual_gen.py::test_visual_gen_api_walkthrough - - examples/visual_gen/test_visual_gen.py::test_flux1_lpips_against_golden - - examples/visual_gen/test_visual_gen.py::test_flux2_lpips_against_golden - - examples/visual_gen/test_visual_gen.py::test_ltx2_lpips_against_golden - - examples/visual_gen/test_visual_gen.py::test_wan21_t2v_lpips_against_golden - - examples/visual_gen/test_visual_gen.py::test_wan22_t2v_lpips_against_golden - - 
visual_gen/test_visual_gen_benchmark.py::test_offline_benchmark - - visual_gen/test_visual_gen_benchmark.py::test_online_benchmark[openai-videos] + # TEMPORARY: narrowed for local CI iteration. Revert before merging. + - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4_repeat_1] + - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4_repeat_2] + - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4_repeat_3] + - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4_repeat_4] + - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4_repeat_5] + - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4_repeat_6] + - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4_repeat_7] + - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4_repeat_8] + - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4_repeat_9] + - accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4_repeat_10] # ------------- AutoDeploy Backend Stages --------------- - condition: ranges: diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index c97a18d781e0..e69de29bb2d1 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -1,436 +0,0 @@ -accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/6120535) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] SKIP (https://nvbugs/6075533) -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/6245651) -accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664) 
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] SKIP (https://nvbugs/5346443) -accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197) -accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-attn_dp_off-trtllm] SKIP (https://nvbugs/6367792) -accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[fp8-4-attn_dp_off-trtllm] SKIP (https://nvbugs/6367792) -accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[nvfp4-4-attn_dp_off-trtllm] SKIP (https://nvbugs/6367792) -accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[fp8_ws4_80gb-trtllm] SKIP (https://nvbugs/6336682) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/6281818) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] SKIP (https://nvbugs/6281818) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency] SKIP (https://nvbugs/6276981) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_piecewise_cuda_graph[mtp3_fp8kv_chunked] SKIP (https://nvbugs/5989920) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6084720) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6095851) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6278337) 
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6278337) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6278337) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6198785) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP] SKIP (https://nvbugs/6313993) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_bf16_gemm_4gpus[tp4-cuda_graph=False] SKIP (https://nvbugs/6224636) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6278403) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6272673) 
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6245394) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-trtllm-one_model-overlap_scheduler] SKIP (https://nvbugs/6341371) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto] SKIP (https://nvbugs/6026676) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-cutlass-auto] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-auto] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-fp8] SKIP (https://nvbugs/5651865) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-cutlass-auto] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-cutlass-auto] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-cutlass-auto] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] SKIP (https://nvbugs/5596343) -accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype SKIP (https://nvbugs/6209806) -accuracy/test_llm_api_pytorch.py::TestKimiK25::test_nvfp4[dep8] SKIP (https://nvbugs/6260890) -accuracy/test_llm_api_pytorch.py::TestKimiK25::test_nvfp4[tp8] SKIP (https://nvbugs/6248837) -accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus] SKIP (https://nvbugs/6368562) 
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5616182) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/6278337) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] SKIP (https://nvbugs/6076767) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] SKIP (https://nvbugs/6256531) -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=True-enable_gemm_allreduce_fusion=True] SKIP (https://nvbugs/6211441) -accuracy/test_llm_api_pytorch.py::TestMiniMaxM2::test_4gpus[attention_dp=False-cuda_graph=True-overlap_scheduler=True-tp_size=4-ep_size=4] SKIP (https://nvbugs/6159132) -accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm] SKIP (https://nvbugs/6248827) -accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm_eagle] SKIP (https://nvbugs/6157892) -accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype SKIP (https://nvbugs/6076767) -accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tep4] SKIP (https://nvbugs/6255417) -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency] SKIP (https://nvbugs/6177390) -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/6177390) -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=True] SKIP (https://nvbugs/6248783) -accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16_mtp SKIP (https://nvbugs/6206179) -accuracy/test_llm_api_pytorch.py::TestQwen3_5_397B_A17B::test_nvfp4[tep4_cutedsl] SKIP (https://nvbugs/6255417) 
-accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_bf16 SKIP (https://nvbugs/6283537) -accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[eagle3_one_model=True-enable_chunked_prefill=False-enable_max_concurrency=True-enable_draft_len_schedule=False] SKIP (https://nvbugs/6368874) -accuracy/test_llm_api_pytorch.py::TestStep3_7::test_fp8_block_scales[tp_size=4-ep_size=4-mtp_nextn=3] SKIP (https://nvbugs/6367805) -accuracy/test_llm_api_pytorch.py::TestStep3_7::test_nvfp4[tp_size=4-ep_size=4-mtp_nextn=3] SKIP (https://nvbugs/6367805) -accuracy/test_llm_api_pytorch_multimodal.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm] SKIP (https://nvbugs/6248827) -accuracy/test_llm_api_pytorch_multimodal.py::TestNanoV3Omni::test_auto_dtype[nvfp4] SKIP (https://nvbugs/6336747) -accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_fp8_block_scales[mtp_nextn=3] SKIP (https://nvbugs/6274932) -accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_nvfp4[mtp_nextn=3] SKIP (https://nvbugs/6367805) -cpp/test_e2e.py::test_benchmarks[bart-90] SKIP (https://nvbugs/5550689) -cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5550689) -cpp/test_e2e.py::test_model[-bart-90] SKIP (https://nvbugs/6162804) -cpp/test_e2e.py::test_model[-encoder-90] SKIP (waive Encoder-only test because it doesn't take batched input) -cpp/test_e2e.py::test_model[-gpt-80] SKIP (https://nvbugs/5983283) -cpp/test_e2e.py::test_model[-mamba-86] SKIP (https://nvbugs/5781665) -cpp/test_e2e.py::test_model[-redrafter-86] SKIP (https://nvbugs/5761642) -cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941) -cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-nixl_kvcache-90] SKIP (https://nvbugs/6093820) -cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-ucx_kvcache-90] SKIP (https://nvbugs/6093820) -cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] 
SKIP (https://nvbugs/5838199) -disaggregated/test_disaggregated.py::test_disaggregated_cancel_large_context_requests[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6105768) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_conditional[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_gen_only[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxpp2_gentp2_one_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxtp2ep2pp2_gentp4_one_mtp_block_reuse[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxtp2ep2pp2_gentp4_one_mtp_block_reuse_long_prompt[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) 
-disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_attention_dp_overlap_one_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6162322) -disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_llama_context_capacity[False-False-DeepSeek-V3-Lite-fp8/fp8] SKIP (https://nvbugs/6266302) -disaggregated/test_workers.py::test_workers_conversation_router[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6162322) -disaggregated/test_workers.py::test_workers_kv_cache_aware_router_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322) -disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6162322) -disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6114139) -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058) -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5234058) 
-examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5234058) -examples/test_gpt.py::test_llm_minitron_fp8_with_pseudo_loras[4b] SKIP (https://nvbugs/5606233) -examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5431132) -examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979) -examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979) -examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5802248) -examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5333849) -examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5222697) -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818) -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818) -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818) -examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (https://nvbugs/4961624) -examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B] SKIP (https://nvbugs/5444636) -examples/test_nemotron_nas.py::test_nemotron_nas_summary_2gpu[DeciLM-7B] SKIP (https://nvbugs/5444636) 
-examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (https://nvbugs/5447530) -examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5612502) -examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570) -examples/visual_gen/test_visual_gen.py::test_flux1_lpips_against_golden SKIP (https://nvbugs/6215688) -examples/visual_gen/test_visual_gen.py::test_flux2_lpips_against_golden SKIP (https://nvbugs/6215688) -examples/visual_gen/test_visual_gen.py::test_ltx2_lpips_against_golden SKIP (https://nvbugs/6215688) -examples/visual_gen/test_visual_gen.py::test_vbench_dimension_score_wan22_a14b_fp8 SKIP (https://nvbugs/6310230) -examples/visual_gen/test_visual_gen.py::test_vbench_dimension_score_wan22_a14b_nvfp4 SKIP (https://nvbugs/6310230) -examples/visual_gen/test_visual_gen.py::test_wan21_t2v_lpips_against_golden SKIP (https://nvbugs/6215688) -examples/visual_gen/test_visual_gen.py::test_wan22_t2v_lpips_against_golden SKIP (https://nvbugs/6215688) -examples/visual_gen/test_visual_gen.py::test_wan_t2v_example SKIP (https://nvbugs/6215688) -examples/visual_gen/test_visual_gen_multi_gpu.py::test_wan22_t2v_lpips_against_golden_multi_gpu[attn2d_2x2] SKIP (https://nvbugs/6272644) -examples/visual_gen/test_visual_gen_multi_gpu.py::test_wan22_t2v_lpips_against_golden_multi_gpu[cfg2_ulysses2] SKIP (https://nvbugs/6272644) -examples/visual_gen/test_visual_gen_multi_gpu.py::test_wan22_t2v_lpips_against_golden_multi_gpu[ulysses4] SKIP (https://nvbugs/6272644) -examples/visual_gen/test_visual_gen_multi_gpu.py::test_wan22_t2v_lpips_against_golden_tp[cfg2_tp2] SKIP (https://nvbugs/6329227) -examples/visual_gen/test_visual_gen_multi_gpu.py::test_wan22_t2v_lpips_against_golden_tp[tp2] SKIP (https://nvbugs/6329227) 
-examples/visual_gen/test_visual_gen_multi_gpu.py::test_wan22_t2v_lpips_against_golden_tp[tp2_ulysses2] SKIP (https://nvbugs/6329227) -full:A100/accuracy/test_disaggregated_serving.py::TestNemotron3Super120B::test_auto_dtype[mtp_nextn=0-block_reuse=False-use_py_transceiver=False] SKIP (https://nvbugs/6322076) -full:A100/accuracy/test_disaggregated_serving.py::TestNemotron3Super120B::test_auto_dtype[mtp_nextn=0-block_reuse=False-use_py_transceiver=True] SKIP (https://nvbugs/6322076) -full:A100/accuracy/test_disaggregated_serving.py::TestNemotron3Super120B::test_auto_dtype[mtp_nextn=3-block_reuse=True-use_py_transceiver=False] SKIP (https://nvbugs/6344108) -full:A100/accuracy/test_disaggregated_serving.py::TestNemotron3Super120B::test_ctx_dp2_gen_tp4 SKIP (https://nvbugs/6344108) -full:A100/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16[tp1-CUTLASS] SKIP (https://nvbugs/6273850) -full:A100/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16_mtp SKIP (https://nvbugs/6239637) -full:A100/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=False] SKIP (https://nvbugs/6315645) -full:A100/accuracy/test_llm_api_pytorch.py::TestStep3_7::test_fp8_block_scales[tp_size=4-ep_size=4-mtp_nextn=0] SKIP (https://nvbugs/6274932) -full:A100/accuracy/test_llm_api_pytorch.py::TestStep3_7::test_fp8_block_scales[tp_size=4-ep_size=4-mtp_nextn=3] SKIP (https://nvbugs/6274932) -full:A100/accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_fp8_block_scales[mtp_nextn=0] SKIP (https://nvbugs/6274932) -full:A100/accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_fp8_block_scales[mtp_nextn=3] SKIP (https://nvbugs/6274932) -full:A100/disaggregated/test_workers.py::test_workers_conditional_disaggregation_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6329052) -full:A100X/llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/6287561) 
-full:B200/accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/5970614) -full:B200/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/6344612) -full:B200/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=True] SKIP (https://nvbugs/6331421) -full:B200/accuracy/test_llm_api_pytorch.py::TestNemotronV3Ultra::test_nvfp4_4gpus_block_reuse[TEP4] SKIP (https://nvbugs/6317074) -full:B200/accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_dflash SKIP (https://nvbugs/6344883) -full:B200/accuracy/test_llm_api_pytorch.py::TestStep3_7::test_auto_dtype[tp_size=8-ep_size=8] SKIP (https://nvbugs/6278377) -full:B200/accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype SKIP (https://nvbugs/6316983) -full:B200/disaggregated/test_disaggregated.py::test_disaggregated_overlap_gen_first[ctx_pp4-TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6344107) -full:B200/disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_32b_fp8_stress] SKIP (https://nvbugs/6312828) -full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugs/5161074) -full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugs/5161074) -full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugs/5161074) -full:B300/accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo_v1-cudagraph:with_padding-pp1dp2cp2] SKIP (https://nvbugs/6322076) -full:B300/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6322076) -full:B300/accuracy/test_llm_api_pytorch.py::TestKimiK25::test_nvfp4[ep8] 
SKIP (https://nvbugs/6322076) -full:B300/disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_genpp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6322073) -full:B300/unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "TRTLLM" SKIP (https://nvbugs/6165866) -full:DGX_B200/unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "TRTLLM" SKIP (https://nvbugs/6165866) -full:DGX_H100/accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[fp8_ws4_80gb-trtllm] SKIP (https://nvbugs/6336682) -full:GB200/accuracy/test_dwdp_disaggregated_serving.py::TestDwdpDeepSeekV3Lite::test_dwdp_accuracy SKIP (https://nvbugs/6276923) -full:GB200/accuracy/test_dwdp_disaggregated_serving.py::TestDwdpDeepSeekV3Lite::test_dwdp_accuracy_contention_opt SKIP (https://nvbugs/6276923) -full:GB200/accuracy/test_dwdp_disaggregated_serving.py::TestDwdpDeepSeekV3Lite::test_dwdp_accuracy_mode_b_overlap SKIP (https://nvbugs/6276923) -full:GB200/accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[fp8_ws4_80gb-trtllm] SKIP (https://nvbugs/6316981) -full:GB200/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8_moe_dflash SKIP (https://nvbugs/6316985) -full:GB200/accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_dflash SKIP (https://nvbugs/6344883) -full:GB200/accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype SKIP (https://nvbugs/6316983) -full:GB200/disaggregated/test_ad_disagg.py::test_async_eagle3_full_model_handoff SKIP (https://nvbugs/6369254) -full:GB300/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-False-False-False] SKIP (https://nvbugs/6316984) -full:GB300/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[nvfp4-1-attn_dp_off-trtllm] SKIP (https://nvbugs/6329165) -full:GB300/accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[bf16_ws4_180gb-trtllm] SKIP (https://nvbugs/6316981) 
-full:GB300/accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[fp8_ws4_80gb-trtllm] SKIP (https://nvbugs/6316981) -full:GB300/accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[nvfp4_ws4_80gb-trtllm] SKIP (https://nvbugs/6316981) -full:GB300/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-fp8] SKIP (https://nvbugs/6316980) -full:GB300/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache_no_reuse-tp4-trtllm-fp8] SKIP (https://nvbugs/6316980) -full:GB300/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8_moe_dflash SKIP (https://nvbugs/6316985) -full:GB300/accuracy/test_llm_api_pytorch.py::TestQwen3_5_397B_A17B::test_nvfp4[tep4_trtllm] SKIP (https://nvbugs/6317600) -full:GB300/accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_dflash SKIP (https://nvbugs/6344883) -full:GB300/accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype SKIP (https://nvbugs/6316983) -full:GB300/disaggregated/test_auto_scaling.py::test_worker_restart[http-round_robin] SKIP (https://nvbugs/6344884) -full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514) -full:GH200/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (arm is not supported) -full:GH200/examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (arm is not supported) -full:GH200/unittest/trt/model_api/test_model_quantization.py SKIP (https://nvbugs/4979955) -full:H100/accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False] SKIP (https://nvbugs/6313072) -full:H100/accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] SKIP (https://nvbugs/6313072) 
-full:H100/accuracy/test_disaggregated_serving.py::TestNemotron3Super120B::test_auto_dtype[mtp_nextn=3-block_reuse=True-use_py_transceiver=False] SKIP (https://nvbugs/6344108) -full:H100/accuracy/test_disaggregated_serving.py::TestNemotron3Super120B::test_ctx_dp2_gen_tp4 SKIP (https://nvbugs/6344108) -full:H100/accuracy/test_llm_api_pytorch.py::TestStep3_7::test_fp8_block_scales[tp_size=4-ep_size=4-mtp_nextn=0] SKIP (https://nvbugs/6274932) -full:H100/accuracy/test_llm_api_pytorch.py::TestStep3_7::test_fp8_block_scales[tp_size=4-ep_size=4-mtp_nextn=3] SKIP (https://nvbugs/6274932) -full:H100/accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_fp8_block_scales[mtp_nextn=0] SKIP (https://nvbugs/6274932) -full:H100/accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_fp8_block_scales[mtp_nextn=3] SKIP (https://nvbugs/6274932) -full:H100/disaggregated/test_disaggregated.py::test_disaggregated_logprobs_serving[llama-3.1-8b-instruct] SKIP (https://nvbugs/6275959) -full:H100/disaggregated/test_disaggregated.py::test_disaggregated_overlap_gen_first[ctx_pp4-TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6344107) -full:H100/disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_eagle_triton_stress] SKIP (https://nvbugs/6250439) -full:H100/disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_32b_fp8_stress] SKIP (https://nvbugs/6312828) -full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551) -full:H20/accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False] SKIP (https://nvbugs/6345827) -full:H20/accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] SKIP (https://nvbugs/6345827) 
-full:H20/accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/6313314) -full:H20/accuracy/test_disaggregated_serving.py::TestNemotron3Super120B::test_auto_dtype[mtp_nextn=3-block_reuse=True-use_py_transceiver=False] SKIP (https://nvbugs/6344108) -full:H20/accuracy/test_disaggregated_serving.py::TestNemotron3Super120B::test_ctx_dp2_gen_tp4 SKIP (https://nvbugs/6344108) -full:H20/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto] SKIP (https://nvbugs/6026676) -full:H20/accuracy/test_llm_api_pytorch.py::TestStep3_7::test_fp8_block_scales[tp_size=4-ep_size=4-mtp_nextn=0] SKIP (https://nvbugs/6274932) -full:H20/accuracy/test_llm_api_pytorch.py::TestStep3_7::test_fp8_block_scales[tp_size=4-ep_size=4-mtp_nextn=3] SKIP (https://nvbugs/6274932) -full:H20/accuracy/test_llm_api_pytorch_encode.py::TestDecoderEncode::test_decoder_encode_cuda_graph_matches_eager_logits[tinyllama-1.1b] SKIP (https://nvbugs/6276842) -full:H20/accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_fp8_block_scales[mtp_nextn=0] SKIP (https://nvbugs/6274932) -full:H20/accuracy/test_llm_api_pytorch_multimodal.py::TestStep3_7::test_fp8_block_scales[mtp_nextn=3] SKIP (https://nvbugs/6274932) -full:H20/disaggregated/test_disaggregated.py::test_disaggregated_overlap_gen_first[ctx_pp4-TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6344107) -full:L40S/accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[triton-False-1] SKIP (https://nvbugs/6322045) -full:L40S/accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[trtllm-False-1] SKIP (https://nvbugs/6322045) -full:L40S/accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[nvidia_Llama-3.1-8B-Instruct-FP8-True] SKIP (https://nvbugs/6327143) -full:L40S/accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[trtllm-flashinfer_ssm-False] SKIP (https://nvbugs/6327147) 
-full:L40S/accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[trtllm-triton_ssm-False] SKIP (https://nvbugs/6327147) -full:L40S/accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=True] SKIP (https://nvbugs/6276841) -full:L40S/accuracy/test_llm_api_pytorch_multimodal.py::TestExaone4_5_33B::test_auto_dtype[forced_chunked_prefill] SKIP (https://nvbugs/6327149) -full:L40S/accuracy/test_llm_api_pytorch_multimodal.py::TestExaone4_5_33B::test_auto_dtype[full_budget] SKIP (https://nvbugs/6327149) -full:L40S/disaggregated/test_workers.py::test_workers_conditional_disaggregation_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6329052) -full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5948435) -full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/5961814) -full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5961814) -full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] SKIP (https://nvbugs/5929339) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v1_kv_cache-trtllm-one_model-no_overlap_scheduler] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v1_kv_cache-trtllm-one_model-overlap_scheduler] SKIP (https://nvbugs/6316152) 
-full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v1_kv_cache-trtllm-two_model-no_overlap_scheduler] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v1_kv_cache-trtllm-two_model-overlap_scheduler] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-cutlass-two_model-no_overlap_scheduler] SKIP (https://nvbugs/6223530) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-trtllm-one_model-no_overlap_scheduler] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-trtllm-one_model-overlap_scheduler] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-trtllm-two_model-no_overlap_scheduler] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-fp8] SKIP (https://nvbugs/6273845) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-auto] SKIP (https://nvbugs/6273846) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-fp8] SKIP (https://nvbugs/6273846) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-auto] SKIP (https://nvbugs/6273846) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-fp8] SKIP (https://nvbugs/6273846) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm-auto] SKIP (https://nvbugs/6273846) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm-fp8] SKIP (https://nvbugs/6273846) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm-auto] 
SKIP (https://nvbugs/6273846) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm-fp8] SKIP (https://nvbugs/6273846) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm-auto] SKIP (https://nvbugs/6273846) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm-fp8] SKIP (https://nvbugs/6273846) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-auto] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-auto] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-ep4-trtllm-fp8] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-trtllm-auto] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-trtllm-fp8] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-auto] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-auto] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-fp8] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-tp4-trtllm-auto] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache_no_reuse-tp4-trtllm-auto] SKIP 
(https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache_no_reuse-tp4-trtllm-fp8] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8] SKIP (https://nvbugs/6316152) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16[tp1-CUTLASS] SKIP (https://nvbugs/6273850) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16_mtp SKIP (https://nvbugs/6275856) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=False] SKIP (https://nvbugs/6313076) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_dflash SKIP (https://nvbugs/6273850) -full:RTX_6000D/accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_fp8 SKIP (https://nvbugs/6273850) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6313072) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False-v2_kv_cache=False] SKIP (https://nvbugs/6313072) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6313072) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=False] SKIP 
(https://nvbugs/6313072) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_batch_waiting[batch_wait_timeout_iters=10-batch_wait_max_tokens_ratio=1.0-mtp_nextn=0-fp8kv=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-v2_kv_cache=True] SKIP (https://nvbugs/6313072) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-cutlass-fp8] SKIP (https://nvbugs/6273845) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-auto] SKIP (https://nvbugs/6273846) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-trtllm-fp8] SKIP (https://nvbugs/6273846) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-auto] SKIP (https://nvbugs/6273846) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v2_kv_cache-True-True-trtllm-fp8] SKIP (https://nvbugs/6273846) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm-auto] SKIP (https://nvbugs/6273846) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm-fp8] SKIP (https://nvbugs/6273846) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm-auto] SKIP (https://nvbugs/6273846) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm-fp8] SKIP (https://nvbugs/6273846) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm-auto] SKIP (https://nvbugs/6273846) 
-full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm-fp8] SKIP (https://nvbugs/6273846) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16[tp1-CUTLASS] SKIP (https://nvbugs/6273850) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16_mtp SKIP (https://nvbugs/6275856) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8[enable_block_reuse=False] SKIP (https://nvbugs/6313076) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_bf16 SKIP (https://nvbugs/6273850) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_dflash SKIP (https://nvbugs/6273850) -full:RTX_PRO_6000_Blackwell_Server_Edition/accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_fp8 SKIP (https://nvbugs/6273850) -full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugs/5161074) -full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugs/5161074) -full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugs/5161074) -full:sm100/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (megatron-core 0.8 is not supported in python 3.12) -full:sm100/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (megatron-core 0.8 is not supported in python 3.12) -full:sm100/unittest/bindings SKIP (Disable for Blackwell) -full:sm100/unittest/llmapi/test_llm_models.py -m "not (part0 or part1)" SKIP (Disable for Blackwell OOM) -full:sm100/unittest/llmapi/test_llm_models.py -m "part0" SKIP (Disable for Blackwell for context fmha doesn't support 
when headsize is 80/96) -full:sm100/unittest/test_model_runner_cpp.py SKIP (Disable for Blackwell) -full:sm100/unittest/trt/attention/test_bert_attention.py SKIP (Disable for Blackwell) -full:sm100/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell) -full:sm100/unittest/trt/functional SKIP (Disable for Blackwell) -full:sm100/unittest/trt/model/test_gpt.py -k "partition0" SKIP (Disable for Blackwell) -full:sm100/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell) -full:sm100/unittest/trt/quantization SKIP (Disable for Blackwell) -full:sm100/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell) -full:sm100/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell) -kv_cache/test_prefix_aware_scheduling.py::TestServePrefixAwareScheduling::test_multi_round_qa_shared_prefix_smoke SKIP (https://nvbugs/6266306) -llmapi/test_llm_api_pytorch_bart.py::test_bart_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-bart-large-cnn] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_bart.py::test_bart_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-greedy-bart-large-cnn] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_bart.py::test_bart_pytorch_generate_encoder_decoder_end_to_end[fp16-kv-v1-cuda-graph-off-greedy-bart-large-cnn] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_bart.py::test_bart_pytorch_generate_encoder_decoder_mixed_encoder_lengths_batch[bf16-kv-v2-cuda-graph-off-greedy-batch2-bart-large-cnn] SKIP (https://nvbugs/6340115) 
-llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-flan-t5-xl] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-t5-base] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-beam2-t5-small0] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v1-cuda-graph-off-greedy-t5-small] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v2-cuda-graph-off-greedy-flan-t5-small] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[bf16-kv-v2-cuda-graph-off-greedy-t5-small] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp16-kv-v1-cuda-graph-off-beam2-flan-t5-small] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp16-kv-v2-cuda-graph-off-greedy-flan-t5-small] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp32-kv-v1-cuda-graph-off-beam2-flan-t5-small] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp32-kv-v1-cuda-graph-off-beam2-t5-small] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_end_to_end[fp32-kv-v2-cuda-graph-off-greedy-t5-small] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_mixed_encoder_lengths_batch[bf16-kv-v1-cuda-graph-off-beam2-batch2-flan-t5-small] SKIP (https://nvbugs/6340115) 
-llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_mixed_encoder_lengths_batch[bf16-kv-v1-cuda-graph-off-beam2-batch2-t5-small] SKIP (https://nvbugs/6340115) -llmapi/test_llm_api_pytorch_t5.py::test_t5_pytorch_generate_encoder_decoder_mixed_encoder_lengths_batch[bf16-kv-v2-cuda-graph-off-greedy-batch2-t5-small] SKIP (https://nvbugs/6340115) -llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_eagle3 SKIP (https://nvbugs/6075431) -llmapi/test_llm_examples.py::test_llmapi_tensorrt_engine SKIP (https://nvbugs/5820553) -perf/test_perf.py::test_perf[bart_large_cnn-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) -perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20] SKIP -perf/test_perf.py::test_perf[flan_t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) -perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20-gpus:2] SKIP -perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20] SKIP -perf/test_perf.py::test_perf[flan_t5_large-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] SKIP -perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8] SKIP -perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] SKIP -perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:128,128] SKIP -perf/test_perf.py::test_perf[mamba_2.8b-bench-float16-input_output_len:512,32] SKIP -perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:128,128] SKIP -perf/test_perf.py::test_perf[mamba_370m-bench-float16-input_output_len:512,32] SKIP -perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] SKIP -perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2] SKIP 
-perf/test_perf.py::test_perf[t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) -perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20] SKIP -perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_dep8_mtp1_8k1k] SKIP (https://nvbugs/6280721) -perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_tep8_mtp3_8k1k] SKIP (https://nvbugs/6280721) -perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_dep4_mtp1_8k1k] SKIP (https://nvbugs/6323889) -perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_tep4_mtp3_1k1k] SKIP (https://nvbugs/6323889) -perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_tep4_mtp3_8k1k] SKIP (https://nvbugs/6280721) -perf/test_perf_sanity.py::test_e2e[aggr_upload-glm5_fp4_2_nodes_grace_blackwell-glm5_fp4_dep8_mtp1_8k1k] SKIP (https://nvbugs/6324131) -perf/test_perf_sanity.py::test_e2e[aggr_upload-glm5_fp4_blackwell-glm5_fp4_dep8_mtp1_8k1k] SKIP (https://nvbugs/6329155) -perf/test_perf_sanity.py::test_e2e[aggr_upload-glm5_fp4_blackwell-glm5_fp4_tep8_mtp3_8k1k] SKIP (https://nvbugs/6329155) -perf/test_perf_sanity.py::test_e2e[aggr_upload-super_ad_blackwell-super_ad_ws1_1k1k] SKIP (https://nvbugs/6153575) -perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6302903) -perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6280649) -perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6302880) 
-perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_glm-5-fp4_1k1k_con4096_ctx1_dep2_gen1_dep8_eplb256_mtp1_ccb-NIXL] SKIP (https://nvbugs/6324131) -perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_glm-5-fp4_8k1k_con1024_ctx1_dep2_gen1_dep8_eplb256_mtp1_ccb-NIXL] SKIP (https://nvbugs/6324131) -perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6368078) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL] SKIP (https://nvbugs/6302903) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] SKIP (https://nvbugs/6302903) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6323889) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-NIXL] SKIP (https://nvbugs/6323889) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL] SKIP (https://nvbugs/6323889) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6323889) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6302903) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] SKIP (https://nvbugs/6280721) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] SKIP (https://nvbugs/6280721) 
-perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL] SKIP (https://nvbugs/6302903) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6324123) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6324123) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6287834) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6324123) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6323074) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6323889) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_glm-5-fp4_1k1k_con4096_ctx1_dep2_gen1_dep8_eplb256_mtp1_ccb-NIXL] SKIP (https://nvbugs/6324131) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_glm-5-fp4_8k1k_con1024_ctx1_dep2_gen1_dep8_eplb256_mtp1_ccb-NIXL] SKIP (https://nvbugs/6324131) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_glm-5-fp4_8k1k_con1_ctx1_dep2_gen1_tep8_eplb0_mtp3_ccb-NIXL] SKIP (https://nvbugs/6324131) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6323074) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6323074) 
-perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb416_mtp3_ccb-NIXL] SKIP (https://nvbugs/6323074) -perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-ltx2_blackwell-ltx2_2stage_bf16_i2v_cfg2_ulysses4_compile_on] SKIP (https://nvbugs/6294413) -perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-ltx2_blackwell-ltx2_2stage_bf16_t2v_cfg2_ulysses4_compile_on] SKIP (https://nvbugs/6294413) -perf/test_visual_gen_perf_sanity.py::test_visual_gen_e2e[vg_upload-ltx2_blackwell-ltx2_nvfp4_i2v_cfg2_ulysses4_compile_on] SKIP (https://nvbugs/6294413) -test_doc.py::test_url_validity SKIP (https://nvbugs/6215684) -test_e2e.py::test_draft_token_tree_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] SKIP (https://nvbugs/6368053) -test_e2e.py::test_draft_token_tree_quickstart_advanced_eagle3_depth_1_tree[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] SKIP (https://nvbugs/6368053) -test_e2e.py::test_multi_nodes_eval[DeepSeek-R1/DeepSeek-R1-0528-FP4-tp16-mmlu] SKIP (https://nvbugs/6276983) -test_e2e.py::test_multi_nodes_eval[Kimi-K2-Thinking-NVFP4-tp16-mmlu] SKIP (https://nvbugs/6276983) -test_e2e.py::test_openai_chat_example[trt] SKIP (https://nvbugs/5477444) -test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450) -test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus[DeepSeek-R1-W4AFP8-DeepSeek-R1/DeepSeek-R1-W4AFP8] SKIP (https://nvbugs/5836830) -test_e2e.py::test_trtllm_bench_iteration_log[TRT-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5448523) -triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359) -triton_server/test_triton.py::test_eagle[eagle] SKIP (https://nvbugs/5477378) -triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora] SKIP 
(https://nvbugs/5470830) -triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118) -triton_server/test_triton.py::test_gpt_gather_logits[gpt-gather-logits] SKIP (https://nvbugs/5766960) -triton_server/test_triton.py::test_gpt_ib[gpt-ib] SKIP (https://nvbugs/5431116) -triton_server/test_triton.py::test_gpt_ib_lad[gpt-ib-lad] SKIP (https://nvbugs/5775223) -triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624) -triton_server/test_triton.py::test_gpt_ib_speculative_decoding_bls[gpt-ib-speculative-decoding-bls] SKIP -triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349) -triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-decoding] SKIP (https://nvbugs/5762854) -triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414) -triton_server/test_triton.py::test_llava_onevision[llava_onevision] SKIP (https://nvbugs/5775205) -triton_server/test_triton.py::test_mistral_ib[mistral-ib] SKIP (https://nvbugs/5477399) -triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343) -triton_server/test_triton.py::test_mllama[mllama] SKIP (https://nvbugs/5333818) -triton_server/test_triton.py::test_python_bls_unit_tests[python-bls-unit-tests] SKIP (https://nvbugs/5477392) -triton_server/test_triton.py::test_qwen2_vl[qwen2_vl] SKIP -triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482) -triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP -triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm] SKIP (https://nvbugs/5701480) -triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False-ensemble] SKIP 
-triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369) -unittest/_torch/misc/test_autotuner.py::test_autotuner_distributed_strategy SKIP (https://nvbugs/6321874) -unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend[act=Relu2-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] SKIP (https://nvbugs/5989912) -unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py::TestLoraAttentionPytorchFlowVsTRT::test_lora_attention SKIP (https://nvbugs/5701421) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens16-_hidden32] SKIP (https://nvbugs/6266259) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens16-_hidden512] SKIP (https://nvbugs/6266259) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens256-_hidden32] SKIP (https://nvbugs/6266259) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-bf16-_tokens256-_hidden512] SKIP (https://nvbugs/6266259) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens16-_hidden32] SKIP (https://nvbugs/6266259) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens16-_hidden512] SKIP (https://nvbugs/6266259) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden32] SKIP (https://nvbugs/6266259) -unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden512] SKIP (https://nvbugs/6266259) -unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingDSv3-swiglu-1024-1024-1] SKIP (https://nvbugs/5908070) 
-unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_qwen_next-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070) -unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_topk_4-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070) -unittest/bindings/test_transfer_agent_bindings.py::TestNixlFunctionalTransfer::test_nixl_wait_in_progress_on_zero_timeout SKIP (https://nvbugs/6260897) -unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_async SKIP (https://nvbugs/5741476) -unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741) -unittest/executor/test_rpc_worker.py SKIP (https://nvbugs/5605741) -unittest/llmapi/test_llm_multi_gpu.py -m "gpu4 and part0" SKIP (https://nvbugs/5348958) -unittest/llmapi/test_llm_multi_gpu_pytorch.py::test_phi3_lora_fused_modules_output_on_tp2_identical_to_tp1 SKIP (https://nvbugs/6109745) -unittest/llmapi/test_llm_pytorch.py::test_qwen_moe_routed_expert_multi_lora_varying_ranks SKIP (https://nvbugs/6335726) -unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781) -unittest/tools/test_layer_wise_benchmarks.py::test_performance_alignment[1] SKIP (https://nvbugs/6127669) -unittest/tools/test_layer_wise_benchmarks.py::test_qwen3_next_gen_tep[1] SKIP (https://nvbugs/6153575) -verl/test_verl_cases.py::test_trtllm_abort SKIP (https://nvbugs/6272653)