@@ -747,6 +747,82 @@ class TestNanoV3Omni(LlmapiAccuracyTestHarness):
747747 marks = (skip_pre_blackwell ,),
748748 id = "nvfp4" ,
749749 ),
750+ # TEMPORARY: duplicate the flaky NVFP4 case for B200 CI iteration.
751+ pytest .param (
752+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
753+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
754+ KvCacheConfig (
755+ free_gpu_memory_fraction = 0.8 ,
756+ mamba_ssm_cache_dtype = "float32" ,
757+ enable_block_reuse = False ,
758+ dtype = "fp8" ,
759+ ),
760+ 128 ,
761+ QuantAlgo .MIXED_PRECISION ,
762+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
763+ marks = (skip_pre_blackwell ,),
764+ id = "nvfp4_repeat1" ,
765+ ),
766+ pytest .param (
767+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
768+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
769+ KvCacheConfig (
770+ free_gpu_memory_fraction = 0.8 ,
771+ mamba_ssm_cache_dtype = "float32" ,
772+ enable_block_reuse = False ,
773+ dtype = "fp8" ,
774+ ),
775+ 128 ,
776+ QuantAlgo .MIXED_PRECISION ,
777+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
778+ marks = (skip_pre_blackwell ,),
779+ id = "nvfp4_repeat2" ,
780+ ),
781+ pytest .param (
782+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
783+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
784+ KvCacheConfig (
785+ free_gpu_memory_fraction = 0.8 ,
786+ mamba_ssm_cache_dtype = "float32" ,
787+ enable_block_reuse = False ,
788+ dtype = "fp8" ,
789+ ),
790+ 128 ,
791+ QuantAlgo .MIXED_PRECISION ,
792+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
793+ marks = (skip_pre_blackwell ,),
794+ id = "nvfp4_repeat3" ,
795+ ),
796+ pytest .param (
797+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
798+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
799+ KvCacheConfig (
800+ free_gpu_memory_fraction = 0.8 ,
801+ mamba_ssm_cache_dtype = "float32" ,
802+ enable_block_reuse = False ,
803+ dtype = "fp8" ,
804+ ),
805+ 128 ,
806+ QuantAlgo .MIXED_PRECISION ,
807+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
808+ marks = (skip_pre_blackwell ,),
809+ id = "nvfp4_repeat4" ,
810+ ),
811+ pytest .param (
812+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
813+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
814+ KvCacheConfig (
815+ free_gpu_memory_fraction = 0.8 ,
816+ mamba_ssm_cache_dtype = "float32" ,
817+ enable_block_reuse = False ,
818+ dtype = "fp8" ,
819+ ),
820+ 128 ,
821+ QuantAlgo .MIXED_PRECISION ,
822+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
823+ marks = (skip_pre_blackwell ,),
824+ id = "nvfp4_repeat5" ,
825+ ),
750826 ],
751827 )
752828 # `torch.compile` uses a thread pool to compile and it's used in audio pre-processing.
0 commit comments