@@ -5795,15 +5795,11 @@ def test_bf16_4gpu(self, tp_size, pp_size, ep_size, cuda_graph,
57955795 "tp_size,pp_size,ep_size,cuda_graph,overlap_scheduler,attention_dp,enable_block_reuse" ,
57965796 [
57975797 (1 , 1 , 1 , True , True , False , True ),
5798- (1 , 1 , 1 , True , True , False , False ),
5799- (4 , 1 , 1 , True , True , False , False ),
58005798 (4 , 1 , 4 , True , True , True , False ),
5801- (4 , 1 , 4 , True , True , False , False ),
5802- (4 , 1 , 4 , False , False , False , False ),
58035799 ],
58045800 ids = [
5805- "tp1_block_reuse" , "tp1" , "tp4ep1" , "tp4ep4_adp_on" ,
5806- "tp4ep4_adp_off" , "no_cuda_graph_overlap"
5801+ "tp1_block_reuse" ,
5802+ "tp4ep4_adp_on" ,
58075803 ])
58085804 def test_nvfp4 (self , moe_backend , tp_size , pp_size , ep_size , cuda_graph ,
58095805 overlap_scheduler , attention_dp , enable_block_reuse , mocker ):
@@ -5820,7 +5816,7 @@ def test_nvfp4(self, moe_backend, tp_size, pp_size, ep_size, cuda_graph,
58205816 kv_cache_config .mamba_state_cache_interval = 256
58215817 pytorch_config = dict (disable_overlap_scheduler = not overlap_scheduler ,
58225818 cuda_graph_config = CudaGraphConfig (
5823- max_batch_size = 512 , enable_padding = False )
5819+ max_batch_size = 512 , enable_padding = True )
58245820 if cuda_graph else None )
58255821 moe_config = MoeConfig (backend = moe_backend )
58265822
@@ -5833,8 +5829,6 @@ def test_nvfp4(self, moe_backend, tp_size, pp_size, ep_size, cuda_graph,
58335829 enable_attention_dp = attention_dp ,
58345830 ** pytorch_config ,
58355831 moe_config = moe_config ) as llm :
5836- task = MMLU (self .MODEL_NAME )
5837- task .evaluate (llm )
58385832 mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" ,
58395833 self .GSM8K_MAX_OUTPUT_LEN )
58405834 task = GSM8K (self .MODEL_NAME )
0 commit comments