Skip to content

Commit 183f4f7

Browse files
committed
temporary test changes for iteration time
Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
1 parent ebb732c commit 183f4f7

5 files changed

Lines changed: 79 additions & 920 deletions

File tree

jenkins/L0_Test.groovy

Lines changed: 1 addition & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -4467,15 +4467,7 @@ def launchTestJobs(pipeline, testFilter)
44674467
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
44684468
"DGX_H100-4_GPUs-AutoDeploy-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
44694469
"DGX_H100-4_GPUs-AutoDeploy-Post-Merge-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
4470-
"DGX_B200-PyTorch-1": ["auto:dgx-b200-flex", "l0_b200", 1, 9, 1, 1, true],
4471-
"DGX_B200-PyTorch-2": ["auto:dgx-b200-flex", "l0_b200", 2, 9, 1, 1, true],
4472-
"DGX_B200-PyTorch-3": ["auto:dgx-b200-flex", "l0_b200", 3, 9, 1, 1, true],
4473-
"DGX_B200-PyTorch-4": ["auto:dgx-b200-flex", "l0_b200", 4, 9, 1, 1, true],
4474-
"DGX_B200-PyTorch-5": ["auto:dgx-b200-flex", "l0_b200", 5, 9, 1, 1, true],
4475-
"DGX_B200-PyTorch-6": ["auto:dgx-b200-flex", "l0_b200", 6, 9, 1, 1, true],
4476-
"DGX_B200-PyTorch-7": ["auto:dgx-b200-flex", "l0_b200", 7, 9, 1, 1, true],
4477-
"DGX_B200-PyTorch-8": ["auto:dgx-b200-flex", "l0_b200", 8, 9, 1, 1, true],
4478-
"DGX_B200-PyTorch-9": ["auto:dgx-b200-flex", "l0_b200", 9, 9, 1, 1, true],
4470+
"DGX_B200-PyTorch-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
44794471
"DGX_B200-AutoDeploy-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
44804472
"DGX_B200-Triton-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
44814473
"DGX_B200-PyTorch-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 2, 1, 1, true],

scripts/check_test_list.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -720,6 +720,11 @@ def verify_waive_list(llm_src, args):
720720
with open(tmp_waives_file, "w") as f:
721721
f.writelines(f"{line}\n" for line in sorted(processed_lines))
722722

723+
if not processed_lines:
724+
print("No integration waive entries found; skipping collection.",
725+
flush=True)
726+
return
727+
723728
subprocess.run(
724729
f"cd {llm_src}/tests/integration/defs && "
725730
f"pytest --test-list={tmp_waives_file} --output-dir={llm_src} -s --co -q",

tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py

Lines changed: 62 additions & 118 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
MTPDecodingConfig,
2424
SamplingParams,
2525
)
26-
from tensorrt_llm.llmapi.llm_args import MultimodalConfig, MultimodalEncoderCudaGraphConfig
26+
from tensorrt_llm.llmapi.llm_args import MultimodalConfig
2727
from tensorrt_llm.quantization import QuantAlgo
2828

2929
from ..conftest import (
@@ -605,132 +605,75 @@ def test_auto_dtype(self, max_num_tokens):
605605
task.evaluate(llm, sampling_params=self.sampling_params)
606606

607607

608+
# The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
609+
# lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
610+
# uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
611+
# explicit image tiling/token accounting in the Mcore wrapper.
612+
# We also keep the generation budget small for CI speed, and this evaluator
613+
# does not strip reasoning traces after </think> before scoring. If the model
614+
# ignores the non-thinking directive, answer extraction may see the reasoning.
615+
EXTRA_EVALUATOR_KWARGS = dict(
616+
apply_chat_template=True,
617+
is_multimodal=True,
618+
)
619+
620+
# NOTE: MMMU adds <|endoftext|> to the stop token.
621+
sampling_params = SamplingParams(
622+
max_tokens=MMMU.MAX_OUTPUT_LEN,
623+
truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
624+
stop="<|endoftext|>",
625+
temperature=0.0,
626+
top_k=1,
627+
)
628+
MMMU_TASK_SPEC = (MMMU, sampling_params, EXTRA_EVALUATOR_KWARGS)
629+
630+
voxpopuli_sampling_params = SamplingParams(
631+
max_tokens=512,
632+
truncate_prompt_tokens=VoxPopuli.MAX_INPUT_LEN,
633+
temperature=0.0,
634+
top_k=1,
635+
)
636+
no_thinking_evaluator_kwargs = {
637+
# We explicitly disable thinking, because otherwise the thinking traces could
638+
# be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor
639+
# for reproducibility (the more tokens there are, the higher likelihood of the
640+
# end output not being the same).
641+
# In addition, if reasoning is cut off, then the WER goes through the roof,
642+
# since each word in the output is treated as an error.
643+
"chat_template_kwargs": {"enable_thinking": False},
644+
}
645+
VOXPOPULI_TASK_SPEC = (
646+
VoxPopuli,
647+
voxpopuli_sampling_params,
648+
no_thinking_evaluator_kwargs,
649+
)
650+
651+
videomme_sampling_params = SamplingParams(
652+
max_tokens=VideoMME.MAX_OUTPUT_LEN,
653+
truncate_prompt_tokens=VideoMME.MAX_INPUT_LEN,
654+
temperature=0.0,
655+
top_k=1,
656+
)
657+
VIDEOMME_TASK_SPEC = (
658+
VideoMME,
659+
videomme_sampling_params,
660+
no_thinking_evaluator_kwargs,
661+
)
662+
663+
608664
# Skip for B300 / GB300:
609665
# * B300 coverage does not meaningfully extend what we test via B200.
610666
# * GB300 may not be entirely up to date for `llm-models`, leading to repo-wide CI errors.
611667
@skip_post_blackwell_ultra
612668
class TestNanoV3Omni(LlmapiAccuracyTestHarness):
613-
# The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
614-
# lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
615-
# uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
616-
# explicit image tiling/token accounting in the Mcore wrapper.
617-
# We also keep the generation budget small for CI speed, and this evaluator
618-
# does not strip reasoning traces after </think> before scoring. If the model
619-
# ignores the non-thinking directive, answer extraction may see the reasoning.
620-
EXTRA_EVALUATOR_KWARGS = dict(
621-
apply_chat_template=True,
622-
is_multimodal=True,
623-
)
624-
625-
# NOTE: MMMU adds <|endoftext|> to the stop token.
626-
sampling_params = SamplingParams(
627-
max_tokens=MMMU.MAX_OUTPUT_LEN,
628-
truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
629-
stop="<|endoftext|>",
630-
temperature=0.0,
631-
top_k=1,
632-
)
633-
MMMU_TASK_SPEC = (MMMU, sampling_params, EXTRA_EVALUATOR_KWARGS)
634-
635-
voxpopuli_sampling_params = SamplingParams(
636-
max_tokens=512,
637-
truncate_prompt_tokens=VoxPopuli.MAX_INPUT_LEN,
638-
temperature=0.0,
639-
top_k=1,
640-
)
641-
no_thinking_evaluator_kwargs = {
642-
# We explicitly disable thinking, because otherwise the thinking traces could
643-
# be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor
644-
# for reproducibility (the more tokens there are, the higher likelihood of the
645-
# end output not being the same).
646-
# In addition, if reasoning is cut off, then the WER goes through the roof,
647-
# since each word in the output is treated as an error.
648-
"chat_template_kwargs": {"enable_thinking": False},
649-
}
650-
VOXPOPULI_TASK_SPEC = (
651-
VoxPopuli,
652-
voxpopuli_sampling_params,
653-
no_thinking_evaluator_kwargs,
654-
)
655-
656-
videomme_sampling_params = SamplingParams(
657-
max_tokens=VideoMME.MAX_OUTPUT_LEN,
658-
truncate_prompt_tokens=VideoMME.MAX_INPUT_LEN,
659-
temperature=0.0,
660-
top_k=1,
661-
)
662-
VIDEOMME_TASK_SPEC = (
663-
VideoMME,
664-
videomme_sampling_params,
665-
no_thinking_evaluator_kwargs,
666-
)
667-
668669
@pytest.mark.skip_less_device_memory(80000)
669670
@pytest.mark.parametrize(
670671
(
671672
"model_name,model_path,kv_cache_config,max_batch_size,"
672673
"expected_quant_algo,task_specs,multimodal_config"
673674
),
674675
[
675-
pytest.param(
676-
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16",
677-
f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16",
678-
KvCacheConfig(
679-
free_gpu_memory_fraction=0.8,
680-
mamba_ssm_cache_dtype="float32",
681-
enable_block_reuse=False,
682-
),
683-
32,
684-
None,
685-
(MMMU_TASK_SPEC,),
686-
None,
687-
id="bf16",
688-
),
689-
pytest.param(
690-
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
691-
f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
692-
KvCacheConfig(
693-
free_gpu_memory_fraction=0.8,
694-
mamba_ssm_cache_dtype="float32",
695-
enable_block_reuse=False,
696-
dtype="fp8",
697-
),
698-
64,
699-
QuantAlgo.FP8,
700-
(MMMU_TASK_SPEC, VOXPOPULI_TASK_SPEC, VIDEOMME_TASK_SPEC),
701-
None,
702-
marks=skip_pre_hopper,
703-
id="fp8",
704-
),
705-
pytest.param(
706-
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
707-
f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8",
708-
KvCacheConfig(
709-
free_gpu_memory_fraction=0.8,
710-
mamba_ssm_cache_dtype="float32",
711-
enable_block_reuse=False,
712-
dtype="fp8",
713-
),
714-
64,
715-
QuantAlgo.FP8,
716-
(MMMU_TASK_SPEC,),
717-
MultimodalConfig(
718-
encoder_cuda_graph={
719-
"vision": MultimodalEncoderCudaGraphConfig(
720-
# Uncomment to debug (logs will show hits / misses), which is how the
721-
# below buckets were determined.
722-
# enable_replay_stats=True,
723-
buckets=[
724-
(1280, 1),
725-
(4096, 1),
726-
(5500, 2),
727-
],
728-
)
729-
},
730-
),
731-
marks=skip_pre_hopper,
732-
id="fp8_mmmu_encoder_cuda_graph",
733-
),
676+
# TEMPORARY: duplicate the flaky NVFP4 case for B200 CI iteration.
734677
pytest.param(
735678
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
736679
f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
@@ -740,13 +683,14 @@ class TestNanoV3Omni(LlmapiAccuracyTestHarness):
740683
enable_block_reuse=False,
741684
dtype="fp8",
742685
),
743-
128,
686+
64,
744687
QuantAlgo.MIXED_PRECISION,
745688
(MMMU_TASK_SPEC, VOXPOPULI_TASK_SPEC, VIDEOMME_TASK_SPEC),
746689
None,
747690
marks=(skip_pre_blackwell,),
748-
id="nvfp4",
749-
),
691+
id=f"nvfp4_repeat_{i}",
692+
)
693+
for i in range(1, 11)
750694
],
751695
)
752696
# `torch.compile` uses a thread pool to compile and it's used in audio pre-processing.

0 commit comments

Comments
 (0)