Skip to content

Commit ccc0708

Browse files
authored
[TRTLLM-12154][test] Add Qwen3-32B FP8 disagg stress test (#14278)
Signed-off-by: Brian Nguyen <brnguyen@nvidia.com>
1 parent 00ed78c commit ccc0708

6 files changed

Lines changed: 110 additions & 4 deletions

File tree

tests/integration/defs/common.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,15 @@ def _war_check_output(*args, **kwargs):
5656
return venv.run_cmd(cmd, caller=_war_check_output, env=env, **kwargs)
5757

5858

59+
def resolve_llm_model_path(model_path: str) -> str:
    """Return a filesystem path for a model used by the integration tests.

    Args:
        model_path: Either an absolute path, or a subpath that is
            interpreted relative to the directory returned by
            ``llm_models_root()``.

    Returns:
        ``model_path`` unchanged when it is already absolute; otherwise
        ``llm_models_root()`` joined with ``model_path``.
    """
    if not os.path.isabs(model_path):
        # The import is kept function-local, as it was originally;
        # presumably this avoids a circular import with conftest -- TODO confirm.
        from .conftest import llm_models_root

        models_root = llm_models_root()
        return os.path.join(models_root, model_path)

    return model_path
66+
67+
5968
def venv_mpi_check_call(venv, mpi_cmd, python_cmd, **kwargs):
6069
"""
6170
This function WAR check_call() to run python_cmd with mpi.
@@ -1287,9 +1296,11 @@ def parse_gsm8k_output(output_text: str) -> float:
12871296
float: The accuracy value (0.7582 in the example)
12881297
"""
12891298

1290-
# Look for the specific pattern: |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7559|± |0.0118|
1299+
# Look for the specific pattern:
1300+
# |gsm8k|...|flexible-extract| 5|exact_match|↑ |0.7559|± |0.0118|
1301+
# lm-eval pads table cells, so allow whitespace around the value.
12911302
patterns = [
1292-
r'flexible-extract\|\s+\d+\|exact_match\|\↑\s+\|(\d+\.\d+)',
1303+
r'flexible-extract\s*\|\s*\d+\s*\|\s*exact_match\s*\|\s*↑\s*\|\s*(\d+(?:\.\d+)?)',
12931304
]
12941305

12951306
for pattern in patterns:
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
hostname: localhost
2+
model: Qwen3/Qwen3-32B-FP8
3+
backend: pytorch
4+
cuda_graph_config: null
5+
guided_decoding_backend: xgrammar
6+
# speculative_config goes at the top level so that both context and generation
7+
# workers see it. Putting it only under `generation_servers:` causes the disagg
8+
# KV transceiver to reject the cacheState handshake (the cacheStates differ
9+
# when only one side has a spec config).
10+
speculative_config:
11+
decoding_type: Eagle3
12+
max_draft_len: 3
13+
speculative_model: Zhi-Create-Qwen3-32B-Eagle3
14+
context_servers:
15+
num_instances: 4
16+
tensor_parallel_size: 1
17+
pipeline_parallel_size: 1
18+
router:
19+
type: kv_cache_aware
20+
enable_chunked_prefill: true
21+
max_num_tokens: 4096
22+
max_seq_len: 10240
23+
max_batch_size: 128
24+
disable_overlap_scheduler: true
25+
print_iter_log: true
26+
kv_cache_config:
27+
enable_block_reuse: true
28+
enable_partial_reuse: true
29+
dtype: fp8
30+
free_gpu_memory_fraction: 0.8
31+
event_buffer_max_size: 1024
32+
cache_transceiver_config:
33+
backend: DEFAULT
34+
max_tokens_in_buffer: 16384
35+
generation_servers:
36+
num_instances: 1
37+
tensor_parallel_size: 4
38+
pipeline_parallel_size: 1
39+
enable_chunked_prefill: true
40+
max_num_tokens: 4096
41+
max_seq_len: 10240
42+
max_batch_size: 128
43+
# Eagle3 requires the overlap scheduler off. Cache settings must match the
44+
# context block or disagg KV transfer rejects the cacheState handshake.
45+
disable_overlap_scheduler: true
46+
print_iter_log: true
47+
kv_cache_config:
48+
enable_block_reuse: true
49+
enable_partial_reuse: true
50+
dtype: fp8
51+
free_gpu_memory_fraction: 0.8
52+
cache_transceiver_config:
53+
backend: DEFAULT
54+
max_tokens_in_buffer: 16384

tests/integration/defs/disaggregated/test_disaggregated.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@
3131
import pytest
3232
import yaml
3333
from defs.common import get_free_port_in_ci as get_free_port
34-
from defs.common import parse_gsm8k_output, wait_for_server
34+
from defs.common import (parse_gsm8k_output, resolve_llm_model_path,
35+
wait_for_server)
3536
from defs.conftest import (get_sm_version, llm_models_root, skip_arm,
3637
skip_no_hopper, skip_pre_blackwell, skip_pre_hopper)
3738
from defs.trt_test_alternative import check_call, check_output, print_info
@@ -308,6 +309,8 @@ def get_test_config(test_desc, example_dir, test_root):
308309
f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_triton.yaml",
309310
"qwen3_5_4b_fp8_stress":
310311
f"{test_configs_root}/disagg_config_ctxtp1_gentp1_qwen3_5_4b_fp8_tllm.yaml",
312+
"qwen3_32b_fp8_stress":
313+
f"{test_configs_root}/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml",
311314
"gpt_oss_120b_harmony":
312315
f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_tllm.yaml",
313316
"cancel_stress_test":
@@ -492,6 +495,11 @@ def run_client_tests(example_dir,
492495
"The capital of Germany is Berlin",
493496
"Using `asyncio` in Python"
494497
]
498+
elif "qwen3_32b_fp8" in test_desc:
499+
expected_strings = [
500+
"The capital of Germany is Berlin",
501+
"Asyncio in Python is a library"
502+
]
495503
else:
496504
expected_strings = [
497505
"The capital of Germany is Berlin",
@@ -618,6 +626,13 @@ def setup_disagg_cluster(
618626
with open(config_file, 'r') as f:
619627
config = yaml.safe_load(f)
620628

629+
speculative_config = config.get("speculative_config")
630+
if isinstance(speculative_config, dict):
631+
speculative_model = speculative_config.get("speculative_model")
632+
if speculative_model:
633+
speculative_config["speculative_model"] = resolve_llm_model_path(
634+
speculative_model)
635+
621636
disagg_cluster = get_default_disagg_cluster_config()
622637
server_host = config.get("hostname", "localhost")
623638
server_port = get_free_port()
@@ -648,6 +663,8 @@ def setup_disagg_cluster(
648663

649664
# Launch workers
650665
model = model_name or config.get("model")
666+
if model:
667+
model = resolve_llm_model_path(model)
651668
ctx_workers = []
652669
gen_workers = []
653670
disagg_server = None
@@ -2288,6 +2305,22 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root,
22882305
cwd=llm_venv.get_working_directory())
22892306

22902307

2308+
@skip_pre_hopper
@pytest.mark.skip_less_device(8)
@pytest.mark.parametrize("model_path", ['Qwen3/Qwen3-32B-FP8'])
def test_disaggregated_qwen3_32b_fp8(disaggregated_test_root,
                                     disaggregated_example_root, llm_venv,
                                     model_path):
    """Run the disaggregated test registered as ``qwen3_32b_fp8_stress``.

    The parametrized ``model_path`` is resolved to a directory with
    ``resolve_llm_model_path``, handed to ``setup_model_symlink``, and then
    passed to ``run_disaggregated_test`` together with the venv's
    environment and working directory. The test is marked to be skipped
    when fewer than 8 devices are available.
    """
    resolved_model_dir = resolve_llm_model_path(model_path)
    setup_model_symlink(llm_venv, resolved_model_dir, model_path)

    # Bind the arguments in the same order the call evaluates them.
    test_env = llm_venv._new_env
    working_dir = llm_venv.get_working_directory()
    run_disaggregated_test(disaggregated_example_root,
                           "qwen3_32b_fp8_stress",
                           env=test_env,
                           model_path=resolved_model_dir,
                           cwd=working_dir)
2322+
2323+
22912324
@pytest.mark.timeout(12600)
22922325
@pytest.mark.parametrize("test_config", [
22932326
pytest.param(TestConfig(model_path='DeepSeek-R1/DeepSeek-R1-0528-FP4-v2',
@@ -2349,6 +2382,11 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root,
23492382
cancellation_rate=10,
23502383
cancellation_delay=0.5),
23512384
marks=(pytest.mark.skip_less_device(2), skip_no_hopper)),
2385+
pytest.param(TestConfig(model_path='Qwen3/Qwen3-32B-FP8',
2386+
test_desc='qwen3_32b_fp8_stress',
2387+
request_count=10000,
2388+
accuracy_threshold=0.42),
2389+
marks=(pytest.mark.skip_less_device(8), skip_pre_hopper)),
23522390
],
23532391
ids=lambda x: x.test_desc)
23542392
@pytest.mark.parametrize("concurrency", [512], ids=lambda x: f"conc{x}")
@@ -2363,7 +2401,7 @@ def test_disaggregated_stress_test(disaggregated_test_root,
23632401
# Unpack configuration from dataclass
23642402
model_path = test_config.model_path
23652403
test_desc = test_config.test_desc
2366-
model_dir = f"{llm_models_root()}/{model_path}"
2404+
model_dir = resolve_llm_model_path(model_path)
23672405
setup_model_symlink(llm_venv, model_dir, model_path)
23682406

23692407
config_file = get_test_config(test_desc, disaggregated_example_root,

tests/integration/test_lists/qa/llm_function_stress.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-outp
55
disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_eagle_trtllm_stress]
66
disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_triton_stress]
77
disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_5_4b_fp8_stress]
8+
disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_32b_fp8_stress]
89
stress_test/disagg_cancel/test_disagg_cancel_stress.py::test_disagg_cancellation_marathon[marathon_cpp_v1_deepseek.yaml] TIMEOUT (45)
910
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus
1011
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_nvfp4_4gpus

tests/integration/test_lists/test-db/l0_dgx_h200.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ l0_dgx_h200:
4040
- disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
4141
- disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
4242
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxtp2ep2pp2_gentp4_one_mtp_block_reuse[DeepSeek-V3-Lite-fp8]
43+
- disaggregated/test_disaggregated.py::test_disaggregated_qwen3_32b_fp8[Qwen3/Qwen3-32B-FP8]
4344
- unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora
4445
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_gen_only_spec_dec
4546
- condition:

tests/integration/test_lists/waives.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
8686
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=False] SKIP (https://nvbugs/6260915)
8787
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=True] SKIP (https://nvbugs/6248783)
8888
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.9-fp8kv=False] SKIP (https://nvbugs/6260915)
89+
accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16_mtp[mtp_on] SKIP (https://nvbugs/6206179)
8990
accuracy/test_llm_api_pytorch.py::TestQwen3_5_397B_A17B::test_nvfp4[tep4_cutedsl] SKIP (https://nvbugs/6255417)
9091
accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_bf16 SKIP (https://nvbugs/6283537)
9192
accuracy/test_llm_api_pytorch.py::TestQwen3_5_9B::test_bf16[mtp_off] SKIP (https://nvbugs/6212250)

0 commit comments

Comments
 (0)