Skip to content

Commit 5e1a98e

Browse files
authored
[https://nvbugs/5910749][https://nvbugs/5995486][test] Fix Qwen3 skip softmax attention CI tests (#12789)
Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
1 parent 07ba6d0 commit 5e1a98e

6 files changed

Lines changed: 42 additions & 21 deletions

File tree

tests/integration/defs/accuracy/references/longbench_v1.yaml

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,21 @@
11
Qwen3/Qwen3-30B-A3B-Instruct-2507:
2-
# Skip Softmax Attention ref accuracy
2+
# Skip Softmax Attention ref accuracy (BF16 KV cache)
33
- extra_acc_spec: "target_sparsity=0.0"
4-
accuracy: 47.357
4+
accuracy: 48.383
55
- extra_acc_spec: "target_sparsity=0.5"
6-
accuracy: 47.102
6+
accuracy: 46.708
77
- extra_acc_spec: "target_sparsity=0.9"
8-
accuracy: 46.169
8+
accuracy: 38.524
9+
# Skip Softmax Attention ref accuracy (FP8 KV cache)
10+
- kv_cache_quant_algo: FP8
11+
extra_acc_spec: "target_sparsity=0.0"
12+
accuracy: 47.650
13+
- kv_cache_quant_algo: FP8
14+
extra_acc_spec: "target_sparsity=0.5"
15+
accuracy: 46.559
16+
- kv_cache_quant_algo: FP8
17+
extra_acc_spec: "target_sparsity=0.9"
18+
accuracy: 38.632
919
deepseek-ai/DeepSeek-V3-0324:
1020
- quant_algo: NVFP4
1121
extra_acc_spec: "target_sparsity=0.9"

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 19 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -4694,26 +4694,29 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness):
46944694
MODEL_PATH = f"{llm_models_root()}/{MODEL_NAME}"
46954695

46964696
@skip_pre_hopper
4697+
@parametrize_with_ids("fp8kv", [False, True])
46974698
@pytest.mark.parametrize(
46984699
"target_sparsity,thr_prefill,thr_decode",
46994700
[
47004701
(0.0, 0.0, 0.0),
4701-
(0.5, 85.97384174442398, 55.48258322852407),
4702-
(0.9, 1418.142868970396, 863.147841750025),
4702+
(0.5, 587.18, 16.52),
4703+
(0.9, 18471.56, 852.20),
47034704
],
47044705
ids=[
47054706
"target_sparsity_0.0", "target_sparsity_0.5", "target_sparsity_0.9"
47064707
],
47074708
)
47084709
def test_skip_softmax_attention(self, target_sparsity: float,
4709-
thr_prefill: float, thr_decode: float):
4710+
thr_prefill: float, thr_decode: float,
4711+
fp8kv: bool):
47104712
sparse_attention_config = SkipSoftmaxAttentionConfig(
47114713
threshold_scale_factor={
47124714
"prefill": thr_prefill,
47134715
"decode": thr_decode,
47144716
})
47154717
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
4716-
enable_block_reuse=False)
4718+
enable_block_reuse=False,
4719+
dtype="fp8" if fp8kv else "auto")
47174720

47184721
with LLM(self.MODEL_PATH,
47194722
attn_backend="TRTLLM",
@@ -4725,34 +4728,38 @@ def test_skip_softmax_attention(self, target_sparsity: float,
47254728
task.evaluate(llm,
47264729
extra_acc_spec=f"target_sparsity={target_sparsity}")
47274730

4731+
@skip_pre_hopper
4732+
@pytest.mark.skip_less_device(4)
4733+
@parametrize_with_ids("fp8kv", [False, True])
47284734
@pytest.mark.parametrize(
47294735
"target_sparsity,thr_prefill,thr_decode",
47304736
[
47314737
(0.0, 0.0, 0.0),
4732-
(0.5, 85.97384174442398, 55.48258322852407),
4733-
(0.9, 1418.142868970396, 863.147841750025),
4738+
(0.5, 587.18, 16.52),
4739+
(0.9, 18471.56, 852.20),
47344740
],
47354741
ids=[
47364742
"target_sparsity_0.0", "target_sparsity_0.5", "target_sparsity_0.9"
47374743
],
47384744
)
4739-
def test_skip_softmax_attention_2gpus(self, target_sparsity: float,
4740-
thr_prefill: float,
4741-
thr_decode: float):
4745+
def test_skip_softmax_attention_4gpus(self, target_sparsity: float,
4746+
thr_prefill: float, thr_decode: float,
4747+
fp8kv: bool):
47424748
sparse_attention_config = SkipSoftmaxAttentionConfig(
47434749
threshold_scale_factor={
47444750
"prefill": thr_prefill,
47454751
"decode": thr_decode,
47464752
})
47474753
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
4748-
enable_block_reuse=False)
4754+
enable_block_reuse=False,
4755+
dtype="fp8" if fp8kv else "auto")
47494756

47504757
with LLM(self.MODEL_PATH,
47514758
attn_backend="TRTLLM",
47524759
max_batch_size=256,
47534760
max_num_tokens=100000,
4754-
tensor_parallel_size=2,
4755-
moe_expert_parallel_size=2,
4761+
tensor_parallel_size=4,
4762+
moe_expert_parallel_size=4,
47564763
enable_attention_dp=True,
47574764
kv_cache_config=kv_cache_config,
47584765
sparse_attention_config=sparse_attention_config) as llm:

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -70,8 +70,7 @@ l0_b200:
7070
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
7171
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
7272
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
73-
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5]
74-
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9]
73+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9-fp8kv=True]
7574
- accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16
7675
- accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8
7776
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -297,6 +297,10 @@ l0_dgx_b200:
297297
- visual_gen/test_visual_gen_benchmark.py::test_online_benchmark[openai-videos]
298298
- examples/test_visual_gen.py::test_vbench_dimension_score_ltx2_bf16
299299
- examples/test_visual_gen.py::test_vbench_dimension_score_ltx2_fp8
300+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=False]
301+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=True]
302+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.9-fp8kv=False]
303+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.9-fp8kv=True]
300304
# ------------- AutoDeploy Backend Stages ---------------
301305
- condition:
302306
ranges:

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -54,9 +54,6 @@ l0_dgx_h100:
5454
- disaggregated/test_disaggregated.py::test_disaggregated_cancel_large_context_requests[DeepSeek-V3-Lite-bf16]
5555
# llmapi
5656
- unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks
57-
# ------------- Skip softmax attention tests ---------------
58-
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.5]
59-
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.9]
6057
- condition:
6158
ranges:
6259
system_gpu_count:

tests/integration/test_lists/test-db/l0_dgx_h200.yml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -134,6 +134,10 @@ l0_dgx_h200:
134134
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
135135
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
136136
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8]
137+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=False]
138+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=True]
139+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.9-fp8kv=False]
140+
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.9-fp8kv=True]
137141
- condition:
138142
ranges:
139143
system_gpu_count:

0 commit comments

Comments (0)