@@ -711,6 +711,82 @@ class TestNanoV3Omni(LlmapiAccuracyTestHarness):
711711 marks = (skip_pre_blackwell ,),
712712 id = "nvfp4" ,
713713 ),
714+ # TEMPORARY: five duplicates of the flaky NVFP4 case (ids nvfp4_repeat1..nvfp4_repeat5) for B200 CI iteration.
715+ pytest .param (
716+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
717+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
718+ KvCacheConfig (
719+ free_gpu_memory_fraction = 0.8 ,
720+ mamba_ssm_cache_dtype = "float32" ,
721+ enable_block_reuse = False ,
722+ dtype = "fp8" ,
723+ ),
724+ 128 ,
725+ QuantAlgo .MIXED_PRECISION ,
726+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
727+ marks = (skip_pre_blackwell ,),
728+ id = "nvfp4_repeat1" ,
729+ ),
730+ pytest .param (
731+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
732+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
733+ KvCacheConfig (
734+ free_gpu_memory_fraction = 0.8 ,
735+ mamba_ssm_cache_dtype = "float32" ,
736+ enable_block_reuse = False ,
737+ dtype = "fp8" ,
738+ ),
739+ 128 ,
740+ QuantAlgo .MIXED_PRECISION ,
741+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
742+ marks = (skip_pre_blackwell ,),
743+ id = "nvfp4_repeat2" ,
744+ ),
745+ pytest .param (
746+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
747+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
748+ KvCacheConfig (
749+ free_gpu_memory_fraction = 0.8 ,
750+ mamba_ssm_cache_dtype = "float32" ,
751+ enable_block_reuse = False ,
752+ dtype = "fp8" ,
753+ ),
754+ 128 ,
755+ QuantAlgo .MIXED_PRECISION ,
756+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
757+ marks = (skip_pre_blackwell ,),
758+ id = "nvfp4_repeat3" ,
759+ ),
760+ pytest .param (
761+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
762+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
763+ KvCacheConfig (
764+ free_gpu_memory_fraction = 0.8 ,
765+ mamba_ssm_cache_dtype = "float32" ,
766+ enable_block_reuse = False ,
767+ dtype = "fp8" ,
768+ ),
769+ 128 ,
770+ QuantAlgo .MIXED_PRECISION ,
771+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
772+ marks = (skip_pre_blackwell ,),
773+ id = "nvfp4_repeat4" ,
774+ ),
775+ pytest .param (
776+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
777+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
778+ KvCacheConfig (
779+ free_gpu_memory_fraction = 0.8 ,
780+ mamba_ssm_cache_dtype = "float32" ,
781+ enable_block_reuse = False ,
782+ dtype = "fp8" ,
783+ ),
784+ 128 ,
785+ QuantAlgo .MIXED_PRECISION ,
786+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
787+ marks = (skip_pre_blackwell ,),
788+ id = "nvfp4_repeat5" ,
789+ ),
714790 ],
715791 )
716792 # `torch.compile` uses a thread pool to compile and it's used in audio pre-processing.
0 commit comments