@@ -747,6 +747,87 @@ class TestNanoV3Omni(LlmapiAccuracyTestHarness):
747747 marks = (skip_pre_blackwell ,),
748748 id = "nvfp4" ,
749749 ),
750+ # TEMPORARY: repeat the flaky "nvfp4" case five times (ids nvfp4_repeat1..nvfp4_repeat5) for B200 CI iteration.
751+ pytest .param (
752+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
753+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
754+ KvCacheConfig (
755+ free_gpu_memory_fraction = 0.8 ,
756+ mamba_ssm_cache_dtype = "float32" ,
757+ enable_block_reuse = False ,
758+ dtype = "fp8" ,
759+ ),
760+ 128 ,
761+ QuantAlgo .MIXED_PRECISION ,
762+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
763+ None ,
764+ marks = (skip_pre_blackwell ,),
765+ id = "nvfp4_repeat1" ,
766+ ),
767+ pytest .param (
768+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
769+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
770+ KvCacheConfig (
771+ free_gpu_memory_fraction = 0.8 ,
772+ mamba_ssm_cache_dtype = "float32" ,
773+ enable_block_reuse = False ,
774+ dtype = "fp8" ,
775+ ),
776+ 128 ,
777+ QuantAlgo .MIXED_PRECISION ,
778+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
779+ None ,
780+ marks = (skip_pre_blackwell ,),
781+ id = "nvfp4_repeat2" ,
782+ ),
783+ pytest .param (
784+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
785+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
786+ KvCacheConfig (
787+ free_gpu_memory_fraction = 0.8 ,
788+ mamba_ssm_cache_dtype = "float32" ,
789+ enable_block_reuse = False ,
790+ dtype = "fp8" ,
791+ ),
792+ 128 ,
793+ QuantAlgo .MIXED_PRECISION ,
794+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
795+ None ,
796+ marks = (skip_pre_blackwell ,),
797+ id = "nvfp4_repeat3" ,
798+ ),
799+ pytest .param (
800+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
801+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
802+ KvCacheConfig (
803+ free_gpu_memory_fraction = 0.8 ,
804+ mamba_ssm_cache_dtype = "float32" ,
805+ enable_block_reuse = False ,
806+ dtype = "fp8" ,
807+ ),
808+ 128 ,
809+ QuantAlgo .MIXED_PRECISION ,
810+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
811+ None ,
812+ marks = (skip_pre_blackwell ,),
813+ id = "nvfp4_repeat4" ,
814+ ),
815+ pytest .param (
816+ "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
817+ f"{ llm_models_root ()} /NVIDIA-Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4" ,
818+ KvCacheConfig (
819+ free_gpu_memory_fraction = 0.8 ,
820+ mamba_ssm_cache_dtype = "float32" ,
821+ enable_block_reuse = False ,
822+ dtype = "fp8" ,
823+ ),
824+ 128 ,
825+ QuantAlgo .MIXED_PRECISION ,
826+ (MMMU_TASK_SPEC , VOXPOPULI_TASK_SPEC , VIDEOMME_TASK_SPEC ),
827+ None ,
828+ marks = (skip_pre_blackwell ,),
829+ id = "nvfp4_repeat5" ,
830+ ),
750831 ],
751832 )
752833 # `torch.compile` uses a thread pool to compile and it's used in audio pre-processing.
0 commit comments