2323 MTPDecodingConfig ,
2424 SamplingParams ,
2525)
26- from tensorrt_llm .llmapi .llm_args import MultimodalConfig , MultimodalEncoderCudaGraphConfig
26+ from tensorrt_llm .llmapi .llm_args import MultimodalConfig
2727from tensorrt_llm .quantization import QuantAlgo
2828
2929from ..conftest import (
@@ -605,132 +605,75 @@ def test_auto_dtype(self, max_num_tokens):
605605 task .evaluate (llm , sampling_params = self .sampling_params )
606606
607607
608+ # The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
609+ # lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
610+ # uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
611+ # explicit image tiling/token accounting in the Mcore wrapper.
612+ # We also keep the generation budget small for CI speed, and this evaluator
613+ # does not strip the reasoning trace preceding </think> before scoring. If the
614+ # model ignores the non-thinking directive, answer extraction may see the reasoning.
615+ EXTRA_EVALUATOR_KWARGS = dict (
616+ apply_chat_template = True ,
617+ is_multimodal = True ,
618+ )
619+
620+ # NOTE: MMMU adds <|endoftext|> as a stop token.
621+ sampling_params = SamplingParams (
622+ max_tokens = MMMU .MAX_OUTPUT_LEN ,
623+ truncate_prompt_tokens = MMMU .MAX_INPUT_LEN ,
624+ stop = "<|endoftext|>" ,
625+ temperature = 0.0 ,
626+ top_k = 1 ,
627+ )
628+ MMMU_TASK_SPEC = (MMMU , sampling_params , EXTRA_EVALUATOR_KWARGS )
629+
630+ voxpopuli_sampling_params = SamplingParams (
631+ max_tokens = 512 ,
632+ truncate_prompt_tokens = VoxPopuli .MAX_INPUT_LEN ,
633+ temperature = 0.0 ,
634+ top_k = 1 ,
635+ )
636+ no_thinking_evaluator_kwargs = {
637+ # We explicitly disable thinking, because otherwise the thinking traces could
638+ # be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor
639+ # for reproducibility (the more tokens there are, the higher likelihood of the
640+ # end output not being the same).
641+ # In addition, if reasoning is cut off, then the WER goes through the roof,
642+ # since each word in the output is treated as an error.
643+ "chat_template_kwargs" : {"enable_thinking" : False },
644+ }
645+ VOXPOPULI_TASK_SPEC = (
646+ VoxPopuli ,
647+ voxpopuli_sampling_params ,
648+ no_thinking_evaluator_kwargs ,
649+ )
650+
651+ videomme_sampling_params = SamplingParams (
652+ max_tokens = VideoMME .MAX_OUTPUT_LEN ,
653+ truncate_prompt_tokens = VideoMME .MAX_INPUT_LEN ,
654+ temperature = 0.0 ,
655+ top_k = 1 ,
656+ )
657+ VIDEOMME_TASK_SPEC = (
658+ VideoMME ,
659+ videomme_sampling_params ,
660+ no_thinking_evaluator_kwargs ,
661+ )
662+
663+
608664# Skip for B300 / GB300:
609665# * B300 coverage does not meaningfully extend what we test via B200.
610666# * GB300 may not be entirely up to date for `llm-models`, leading to repo-wide CI errors.
611667@skip_post_blackwell_ultra
612668class TestNanoV3Omni (LlmapiAccuracyTestHarness ):
613- # The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
614- # lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
615- # uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
616- # explicit image tiling/token accounting in the Mcore wrapper.
617- # We also keep the generation budget small for CI speed, and this evaluator
618- # does not strip reasoning traces after </think> before scoring. If the model
619- # ignores the non-thinking directive, answer extraction may see the reasoning.
620- EXTRA_EVALUATOR_KWARGS = dict (
621- apply_chat_template = True ,
622- is_multimodal = True ,
623- )
624-
625- # NOTE: MMMU adds <|endoftext|> to the stop token.
626- sampling_params = SamplingParams (
627- max_tokens = MMMU .MAX_OUTPUT_LEN ,
628- truncate_prompt_tokens = MMMU .MAX_INPUT_LEN ,
629- stop = "<|endoftext|>" ,
630- temperature = 0.0 ,
631- top_k = 1 ,
632- )
633- MMMU_TASK_SPEC = (MMMU , sampling_params , EXTRA_EVALUATOR_KWARGS )
634-
635- voxpopuli_sampling_params = SamplingParams (
636- max_tokens = 512 ,
637- truncate_prompt_tokens = VoxPopuli .MAX_INPUT_LEN ,
638- temperature = 0.0 ,
639- top_k = 1 ,
640- )
641- no_thinking_evaluator_kwargs = {
642- # We explicitly disable thinking, because otherwise the thinking traces could
643- # be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor
644- # for reproducibility (the more tokens there are, the higher likelihood of the
645- # end output not being the same).
646- # In addition, if reasoning is cut off, then the WER goes through the roof,
647- # since each word in the output is treated as an error.
648- "chat_template_kwargs" : {"enable_thinking" : False },
649- }
650- VOXPOPULI_TASK_SPEC = (
651- VoxPopuli ,
652- voxpopuli_sampling_params ,
653- no_thinking_evaluator_kwargs ,
654- )
655-
656- videomme_sampling_params = SamplingParams (
657- max_tokens = VideoMME .MAX_OUTPUT_LEN ,
658- truncate_prompt_tokens = VideoMME .MAX_INPUT_LEN ,
659- temperature = 0.0 ,
660- top_k = 1 ,
661- )
662- VIDEOMME_TASK_SPEC = (
663- VideoMME ,
664- videomme_sampling_params ,
665- no_thinking_evaluator_kwargs ,
666- )
667-
668669 @pytest .mark .skip_less_device_memory (80000 )
669670 @pytest .mark .parametrize (
670671 (
671672 "model_name,model_path,kv_cache_config,max_batch_size,"
672673 "expected_quant_algo,task_specs,multimodal_config"
673674 ),
674675 [
675- pytest .param (
676- "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16" ,
677- f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16" ,
678- KvCacheConfig (
679- free_gpu_memory_fraction = 0.8 ,
680- mamba_ssm_cache_dtype = "float32" ,
681- enable_block_reuse = False ,
682- ),
683- 32 ,
684- None ,
685- (MMMU_TASK_SPEC ,),
686- None ,
687- id = "bf16" ,
688- ),
689- pytest .param (
690- "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8" ,
691- f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8" ,
692- KvCacheConfig (
693- free_gpu_memory_fraction = 0.8 ,
694- mamba_ssm_cache_dtype = "float32" ,
695- enable_block_reuse = False ,
696- dtype = "fp8" ,
697- ),
698- 64 ,
699- QuantAlgo .FP8 ,
700- (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
701- None ,
702- marks = skip_pre_hopper ,
703- id = "fp8" ,
704- ),
705- pytest .param (
706- "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8" ,
707- f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-FP8" ,
708- KvCacheConfig (
709- free_gpu_memory_fraction = 0.8 ,
710- mamba_ssm_cache_dtype = "float32" ,
711- enable_block_reuse = False ,
712- dtype = "fp8" ,
713- ),
714- 64 ,
715- QuantAlgo .FP8 ,
716- (MMMU_TASK_SPEC ,),
717- MultimodalConfig (
718- encoder_cuda_graph = {
719- "vision" : MultimodalEncoderCudaGraphConfig (
720- # Uncomment to debug (logs will show hits / misses), which is how the
721- # below buckets were determined.
722- # enable_replay_stats=True,
723- buckets = [
724- (1280 , 1 ),
725- (4096 , 1 ),
726- (5500 , 2 ),
727- ],
728- )
729- },
730- ),
731- marks = skip_pre_hopper ,
732- id = "fp8_mmmu_encoder_cuda_graph" ,
733- ),
676+ # TEMPORARY: duplicate the flaky NVFP4 case for B200 CI iteration.
734677 pytest .param (
735678 "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
736679 f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
@@ -740,13 +683,14 @@ class TestNanoV3Omni(LlmapiAccuracyTestHarness):
740683 enable_block_reuse = False ,
741684 dtype = "fp8" ,
742685 ),
743- 128 ,
686+ 64 ,
744687 QuantAlgo .MIXED_PRECISION ,
745688 (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
746689 None ,
747690 marks = (skip_pre_blackwell ,),
748- id = "nvfp4" ,
749- ),
691+ id = f"nvfp4_repeat_{ i } " ,
692+ )
693+ for i in range (1 , 11 )
750694 ],
751695 )
752696 # `torch.compile` uses a thread pool to compile and it's used in audio pre-processing.
0 commit comments