@@ -605,66 +605,67 @@ def test_auto_dtype(self, max_num_tokens):
605605 task .evaluate (llm , sampling_params = self .sampling_params )
606606
607607
608+ # The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
609+ # lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
610+ # uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
611+ # explicit image tiling/token accounting in the Mcore wrapper.
612+ # We also keep the generation budget small for CI speed, and this evaluator
613+ # does not strip the reasoning trace (text up to </think>) before scoring. If
614+ # the model ignores the non-thinking directive, answer extraction may see it.
615+ EXTRA_EVALUATOR_KWARGS = dict (
616+ apply_chat_template = True ,
617+ is_multimodal = True ,
618+ )
619+
620+ # NOTE: MMMU adds <|endoftext|> to the stop tokens.
621+ sampling_params = SamplingParams (
622+ max_tokens = MMMU .MAX_OUTPUT_LEN ,
623+ truncate_prompt_tokens = MMMU .MAX_INPUT_LEN ,
624+ stop = "<|endoftext|>" ,
625+ temperature = 0.0 ,
626+ top_k = 1 ,
627+ )
628+ MMMU_TASK_SPEC = (MMMU , sampling_params , EXTRA_EVALUATOR_KWARGS )
629+
630+ voxpopuli_sampling_params = SamplingParams (
631+ max_tokens = 512 ,
632+ truncate_prompt_tokens = VoxPopuli .MAX_INPUT_LEN ,
633+ temperature = 0.0 ,
634+ top_k = 1 ,
635+ )
636+ no_thinking_evaluator_kwargs = {
637+ # We explicitly disable thinking, because otherwise the thinking traces could
638+ # be absurdly long (20k+ tokens), which helps neither test runtime nor
639+ # reproducibility (the more tokens are generated, the higher the likelihood
640+ # that the final output differs between runs).
641+ # In addition, if reasoning is cut off, then the WER goes through the roof,
642+ # since each word in the output is treated as an error.
643+ "chat_template_kwargs" : {"enable_thinking" : False },
644+ }
645+ VOXPOPULI_TASK_SPEC = (
646+ VoxPopuli ,
647+ voxpopuli_sampling_params ,
648+ no_thinking_evaluator_kwargs ,
649+ )
650+
651+ videomme_sampling_params = SamplingParams (
652+ max_tokens = VideoMME .MAX_OUTPUT_LEN ,
653+ truncate_prompt_tokens = VideoMME .MAX_INPUT_LEN ,
654+ temperature = 0.0 ,
655+ top_k = 1 ,
656+ )
657+ VIDEOMME_TASK_SPEC = (
658+ VideoMME ,
659+ videomme_sampling_params ,
660+ no_thinking_evaluator_kwargs ,
661+ )
662+
663+
608664# Skip for B300 / GB300:
609665# * B300 coverage does not meaningfully extend what we test via B200.
610666# * GB300 may not be entirely up to date for `llm-models`, leading to repo-wide CI errors.
611667@skip_post_blackwell_ultra
612668class TestNanoV3Omni (LlmapiAccuracyTestHarness ):
613- # The score here may be lower than VLMEvalKitMcore (official) runs. This path uses
614- # lm_eval's MMMU task, prompt formatting, and scoring, while VLMEvalKitMcore
615- # uses MMMU_DEV_VAL with its own MCQ prompt builder, answer extraction, and
616- # explicit image tiling/token accounting in the Mcore wrapper.
617- # We also keep the generation budget small for CI speed, and this evaluator
618- # does not strip reasoning traces after </think> before scoring. If the model
619- # ignores the non-thinking directive, answer extraction may see the reasoning.
620- EXTRA_EVALUATOR_KWARGS = dict (
621- apply_chat_template = True ,
622- is_multimodal = True ,
623- )
624-
625- # NOTE: MMMU adds <|endoftext|> to the stop token.
626- sampling_params = SamplingParams (
627- max_tokens = MMMU .MAX_OUTPUT_LEN ,
628- truncate_prompt_tokens = MMMU .MAX_INPUT_LEN ,
629- stop = "<|endoftext|>" ,
630- temperature = 0.0 ,
631- top_k = 1 ,
632- )
633- MMMU_TASK_SPEC = (MMMU , sampling_params , EXTRA_EVALUATOR_KWARGS )
634-
635- voxpopuli_sampling_params = SamplingParams (
636- max_tokens = 512 ,
637- truncate_prompt_tokens = VoxPopuli .MAX_INPUT_LEN ,
638- temperature = 0.0 ,
639- top_k = 1 ,
640- )
641- no_thinking_evaluator_kwargs = {
642- # We explicitly disable thinking, because otherwise the thinking traces could
643- # be absurdly long (20k+ tokens), which is not helpful for test-runtime, nor
644- # for reproducibility (the more tokens there are, the higher likelihood of the
645- # end output not being the same).
646- # In addition, if reasoning is cut off, then the WER goes through the roof,
647- # since each word in the output is treated as an error.
648- "chat_template_kwargs" : {"enable_thinking" : False },
649- }
650- VOXPOPULI_TASK_SPEC = (
651- VoxPopuli ,
652- voxpopuli_sampling_params ,
653- no_thinking_evaluator_kwargs ,
654- )
655-
656- videomme_sampling_params = SamplingParams (
657- max_tokens = VideoMME .MAX_OUTPUT_LEN ,
658- truncate_prompt_tokens = VideoMME .MAX_INPUT_LEN ,
659- temperature = 0.0 ,
660- top_k = 1 ,
661- )
662- VIDEOMME_TASK_SPEC = (
663- VideoMME ,
664- videomme_sampling_params ,
665- no_thinking_evaluator_kwargs ,
666- )
667-
668669 @pytest .mark .skip_less_device_memory (80000 )
669670 @pytest .mark .parametrize (
670671 (
@@ -747,6 +748,26 @@ class TestNanoV3Omni(LlmapiAccuracyTestHarness):
747748 marks = (skip_pre_blackwell ,),
748749 id = "nvfp4" ,
749750 ),
751+ ]
752+ + [
753+ # TEMPORARY: duplicate the flaky NVFP4 case for B200 CI iteration.
754+ pytest .param (
755+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
756+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
757+ KvCacheConfig (
758+ free_gpu_memory_fraction = 0.8 ,
759+ mamba_ssm_cache_dtype = "float32" ,
760+ enable_block_reuse = False ,
761+ dtype = "fp8" ,
762+ ),
763+ 64 ,
764+ QuantAlgo .MIXED_PRECISION ,
765+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
766+ None ,
767+ marks = (skip_pre_blackwell ,),
768+ id = f"nvfp4_repeat_{ i } " ,
769+ )
770+ for i in range (1 , 11 )
750771 ],
751772 )
752773 # `torch.compile` uses a thread pool to compile and it's used in audio pre-processing.
0 commit comments