@@ -4694,26 +4694,29 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
46944694 MODEL_PATH = f"{ llm_models_root ()} /{ MODEL_NAME } "
46954695
46964696 @skip_pre_hopper
4697+ @parametrize_with_ids ("fp8kv" , [False , True ])
46974698 @pytest .mark .parametrize (
46984699 "target_sparsity,thr_prefill,thr_decode" ,
46994700 [
47004701 (0.0 , 0.0 , 0.0 ),
4701- (0.5 , 85.97384174442398 , 55.48258322852407 ),
4702- (0.9 , 1418.142868970396 , 863.147841750025 ),
4702+ (0.5 , 587.18 , 16.52 ),
4703+ (0.9 , 18471.56 , 852.20 ),
47034704 ],
47044705 ids = [
47054706 "target_sparsity_0.0" , "target_sparsity_0.5" , "target_sparsity_0.9"
47064707 ],
47074708 )
47084709 def test_skip_softmax_attention (self , target_sparsity : float ,
4709- thr_prefill : float , thr_decode : float ):
4710+ thr_prefill : float , thr_decode : float ,
4711+ fp8kv : bool ):
47104712 sparse_attention_config = SkipSoftmaxAttentionConfig (
47114713 threshold_scale_factor = {
47124714 "prefill" : thr_prefill ,
47134715 "decode" : thr_decode ,
47144716 })
47154717 kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.75 ,
4716- enable_block_reuse = False )
4718+ enable_block_reuse = False ,
4719+ dtype = "fp8" if fp8kv else "auto" )
47174720
47184721 with LLM (self .MODEL_PATH ,
47194722 attn_backend = "TRTLLM" ,
@@ -4725,34 +4728,38 @@ def test_skip_softmax_attention(self, target_sparsity: float,
47254728 task .evaluate (llm ,
47264729 extra_acc_spec = f"target_sparsity={ target_sparsity } " )
47274730
4731+ @skip_pre_hopper
4732+ @pytest .mark .skip_less_device (4 )
4733+ @parametrize_with_ids ("fp8kv" , [False , True ])
47284734 @pytest .mark .parametrize (
47294735 "target_sparsity,thr_prefill,thr_decode" ,
47304736 [
47314737 (0.0 , 0.0 , 0.0 ),
4732- (0.5 , 85.97384174442398 , 55.48258322852407 ),
4733- (0.9 , 1418.142868970396 , 863.147841750025 ),
4738+ (0.5 , 587.18 , 16.52 ),
4739+ (0.9 , 18471.56 , 852.20 ),
47344740 ],
47354741 ids = [
47364742 "target_sparsity_0.0" , "target_sparsity_0.5" , "target_sparsity_0.9"
47374743 ],
47384744 )
4739- def test_skip_softmax_attention_2gpus (self , target_sparsity : float ,
4740- thr_prefill : float ,
4741- thr_decode : float ):
4745+ def test_skip_softmax_attention_4gpus (self , target_sparsity : float ,
4746+ thr_prefill : float , thr_decode : float ,
4747+ fp8kv : bool ):
47424748 sparse_attention_config = SkipSoftmaxAttentionConfig (
47434749 threshold_scale_factor = {
47444750 "prefill" : thr_prefill ,
47454751 "decode" : thr_decode ,
47464752 })
47474753 kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.75 ,
4748- enable_block_reuse = False )
4754+ enable_block_reuse = False ,
4755+ dtype = "fp8" if fp8kv else "auto" )
47494756
47504757 with LLM (self .MODEL_PATH ,
47514758 attn_backend = "TRTLLM" ,
47524759 max_batch_size = 256 ,
47534760 max_num_tokens = 100000 ,
4754- tensor_parallel_size = 2 ,
4755- moe_expert_parallel_size = 2 ,
4761+ tensor_parallel_size = 4 ,
4762+ moe_expert_parallel_size = 4 ,
47564763 enable_attention_dp = True ,
47574764 kv_cache_config = kv_cache_config ,
47584765 sparse_attention_config = sparse_attention_config ) as llm :
0 commit comments