Skip to content

Commit 740a40d

Browse files
committed
temporary test changes for iteration time
Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
1 parent 5b09793 commit 740a40d

5 files changed

Lines changed: 93 additions & 860 deletions

File tree

jenkins/L0_Test.groovy

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4467,15 +4467,7 @@ def launchTestJobs(pipeline, testFilter)
44674467
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
44684468
"DGX_H100-4_GPUs-AutoDeploy-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
44694469
"DGX_H100-4_GPUs-AutoDeploy-Post-Merge-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
4470-
"DGX_B200-PyTorch-1": ["auto:dgx-b200-flex", "l0_b200", 1, 9, 1, 1, true],
4471-
"DGX_B200-PyTorch-2": ["auto:dgx-b200-flex", "l0_b200", 2, 9, 1, 1, true],
4472-
"DGX_B200-PyTorch-3": ["auto:dgx-b200-flex", "l0_b200", 3, 9, 1, 1, true],
4473-
"DGX_B200-PyTorch-4": ["auto:dgx-b200-flex", "l0_b200", 4, 9, 1, 1, true],
4474-
"DGX_B200-PyTorch-5": ["auto:dgx-b200-flex", "l0_b200", 5, 9, 1, 1, true],
4475-
"DGX_B200-PyTorch-6": ["auto:dgx-b200-flex", "l0_b200", 6, 9, 1, 1, true],
4476-
"DGX_B200-PyTorch-7": ["auto:dgx-b200-flex", "l0_b200", 7, 9, 1, 1, true],
4477-
"DGX_B200-PyTorch-8": ["auto:dgx-b200-flex", "l0_b200", 8, 9, 1, 1, true],
4478-
"DGX_B200-PyTorch-9": ["auto:dgx-b200-flex", "l0_b200", 9, 9, 1, 1, true],
4470+
"DGX_B200-PyTorch-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
44794471
"DGX_B200-AutoDeploy-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
44804472
"DGX_B200-Triton-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true],
44814473
"DGX_B200-PyTorch-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 2, 1, 1, true],

scripts/check_test_list.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,11 @@ def verify_waive_list(llm_src, args):
720720
with open(tmp_waives_file, "w") as f:
721721
f.writelines(f"{line}\n" for line in sorted(processed_lines))
722722

723+
if not processed_lines:
724+
print("No integration waive entries found; skipping collection.",
725+
flush=True)
726+
return
727+
723728
subprocess.run(
724729
f"cd {llm_src}/tests/integration/defs && "
725730
f"pytest --test-list={tmp_waives_file} --output-dir={llm_src} -s --co -q",

tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py

Lines changed: 76 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -605,66 +605,67 @@ def test_auto_dtype(self, max_num_tokens):
605605
task.evaluate(llm, sampling_params=self.sampling_params)
606606

607607

608+
# The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
609+
# lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
610+
# uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
611+
# explicit image tiling/token accounting in the Mcore wrapper.
612+
# We also keep the generation budget small for CI speed, and this evaluator
613+
# does not strip reasoning traces after </think> before scoring. If the model
614+
# ignores the non-thinking directive, answer extraction may see the reasoning.
615+
EXTRA_EVALUATOR_KWARGS = dict(
616+
apply_chat_template=True,
617+
is_multimodal=True,
618+
)
619+
620+
# NOTE: MMMU adds <|endoftext|> to the stop token.
621+
sampling_params = SamplingParams(
622+
max_tokens=MMMU.MAX_OUTPUT_LEN,
623+
truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
624+
stop="<|endoftext|>",
625+
temperature=0.0,
626+
top_k=1,
627+
)
628+
MMMU_TASK_SPEC = (MMMU, sampling_params, EXTRA_EVALUATOR_KWARGS)
629+
630+
voxpopuli_sampling_params = SamplingParams(
631+
max_tokens=512,
632+
truncate_prompt_tokens=VoxPopuli.MAX_INPUT_LEN,
633+
temperature=0.0,
634+
top_k=1,
635+
)
636+
no_thinking_evaluator_kwargs = {
637+
# We explicitly disable thinking, because otherwise the thinking traces could
638+
# be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor
639+
# for reproducibility (the more tokens there are, the higher likelihood of the
640+
# end output not being the same).
641+
# In addition, if reasoning is cut off, then the WER goes through the roof,
642+
# since each word in the output is treated as an error.
643+
"chat_template_kwargs": {"enable_thinking": False},
644+
}
645+
VOXPOPULI_TASK_SPEC = (
646+
VoxPopuli,
647+
voxpopuli_sampling_params,
648+
no_thinking_evaluator_kwargs,
649+
)
650+
651+
videomme_sampling_params = SamplingParams(
652+
max_tokens=VideoMME.MAX_OUTPUT_LEN,
653+
truncate_prompt_tokens=VideoMME.MAX_INPUT_LEN,
654+
temperature=0.0,
655+
top_k=1,
656+
)
657+
VIDEOMME_TASK_SPEC = (
658+
VideoMME,
659+
videomme_sampling_params,
660+
no_thinking_evaluator_kwargs,
661+
)
662+
663+
608664
# Skip for B300 / GB300:
609665
# * B300 coverage does not meaningfully extend what we test via B200.
610666
# * GB300 may not be entirely up to date for `llm-models`, leading to repo-wide CI errors.
611667
@skip_post_blackwell_ultra
612668
class TestNanoV3Omni(LlmapiAccuracyTestHarness):
613-
# The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
614-
# lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
615-
# uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
616-
# explicit image tiling/token accounting in the Mcore wrapper.
617-
# We also keep the generation budget small for CI speed, and this evaluator
618-
# does not strip reasoning traces after </think> before scoring. If the model
619-
# ignores the non-thinking directive, answer extraction may see the reasoning.
620-
EXTRA_EVALUATOR_KWARGS = dict(
621-
apply_chat_template=True,
622-
is_multimodal=True,
623-
)
624-
625-
# NOTE: MMMU adds <|endoftext|> to the stop token.
626-
sampling_params = SamplingParams(
627-
max_tokens=MMMU.MAX_OUTPUT_LEN,
628-
truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
629-
stop="<|endoftext|>",
630-
temperature=0.0,
631-
top_k=1,
632-
)
633-
MMMU_TASK_SPEC = (MMMU, sampling_params, EXTRA_EVALUATOR_KWARGS)
634-
635-
voxpopuli_sampling_params = SamplingParams(
636-
max_tokens=512,
637-
truncate_prompt_tokens=VoxPopuli.MAX_INPUT_LEN,
638-
temperature=0.0,
639-
top_k=1,
640-
)
641-
no_thinking_evaluator_kwargs = {
642-
# We explicitly disable thinking, because otherwise the thinking traces could
643-
# be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor
644-
# for reproducibility (the more tokens there are, the higher likelihood of the
645-
# end output not being the same).
646-
# In addition, if reasoning is cut off, then the WER goes through the roof,
647-
# since each word in the output is treated as an error.
648-
"chat_template_kwargs": {"enable_thinking": False},
649-
}
650-
VOXPOPULI_TASK_SPEC = (
651-
VoxPopuli,
652-
voxpopuli_sampling_params,
653-
no_thinking_evaluator_kwargs,
654-
)
655-
656-
videomme_sampling_params = SamplingParams(
657-
max_tokens=VideoMME.MAX_OUTPUT_LEN,
658-
truncate_prompt_tokens=VideoMME.MAX_INPUT_LEN,
659-
temperature=0.0,
660-
top_k=1,
661-
)
662-
VIDEOMME_TASK_SPEC = (
663-
VideoMME,
664-
videomme_sampling_params,
665-
no_thinking_evaluator_kwargs,
666-
)
667-
668669
@pytest.mark.skip_less_device_memory(80000)
669670
@pytest.mark.parametrize(
670671
(
@@ -747,6 +748,26 @@ class TestNanoV3Omni(LlmapiAccuracyTestHarness):
747748
marks=(skip_pre_blackwell,),
748749
id="nvfp4",
749750
),
751+
]
752+
+ [
753+
# TEMPORARY: duplicate the flaky NVFP4 case for B200 CI iteration.
754+
pytest.param(
755+
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
756+
f"{llm_models_root()}/NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
757+
KvCacheConfig(
758+
free_gpu_memory_fraction=0.8,
759+
mamba_ssm_cache_dtype="float32",
760+
enable_block_reuse=False,
761+
dtype="fp8",
762+
),
763+
64,
764+
QuantAlgo.MIXED_PRECISION,
765+
(MMMU_TASK_SPEC, VOXPOPULI_TASK_SPEC, VIDEOMME_TASK_SPEC),
766+
None,
767+
marks=(skip_pre_blackwell,),
768+
id=f"nvfp4_repeat_{i}",
769+
)
770+
for i in range(1, 11)
750771
],
751772
)
752773
# `torch.compile` uses a thread pool to compile and it's used in audio pre-processing.

0 commit comments

Comments
 (0)