Skip to content

Commit 2c22fef

Browse files
committed
[TRTLLM-12154][test] Add Qwen3-32B FP8 Eagle3 disagg stress test
Add a Qwen3-32B FP8 disaggregated serving smoke and stress test that exercises Eagle3 with 4x TP1 context workers and 1x TP4 generation worker on 8 GPUs.

The YAML enables FP8 KV cache, chunked prefill, block and partial reuse, cache transfer, and a top-level Eagle3 speculative_config shared by context and generation workers. The draft model is stored as a model-root-relative path, and the disagg harness now resolves relative model and speculative_model values through llm_models_root while preserving absolute paths.

Wire the smoke test into the H200 L0 list and the full 10k-request stress case into the QA stress list. Add Qwen-specific output substring checks and keep the stress accuracy threshold aligned with the adjacent GPT-OSS stress case.

Signed-off-by: Brian Nguyen <brnguyen@nvidia.com>
1 parent f406f6e commit 2c22fef

5 files changed

Lines changed: 105 additions & 3 deletions

File tree

tests/integration/defs/common.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1287,9 +1287,11 @@ def parse_gsm8k_output(output_text: str) -> float:
12871287
float: The accuracy value (0.7582 in the example)
12881288
"""
12891289

1290-
# Look for the specific pattern: |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7559|± |0.0118|
1290+
# Look for the specific pattern:
1291+
# |gsm8k|...|flexible-extract| 5|exact_match|↑ |0.7559|± |0.0118|
1292+
# lm-eval pads table cells, so allow whitespace around the value.
12911293
patterns = [
1292-
r'flexible-extract\|\s+\d+\|exact_match\|\↑\s+\|(\d+\.\d+)',
1294+
r'flexible-extract\s*\|\s*\d+\s*\|\s*exact_match\s*\|\s*↑\s*\|\s*(\d+(?:\.\d+)?)',
12931295
]
12941296

12951297
for pattern in patterns:
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
hostname: localhost
2+
model: Qwen3/Qwen3-32B-FP8
3+
backend: pytorch
4+
cuda_graph_config: null
5+
guided_decoding_backend: xgrammar
6+
# speculative_config goes top-level so both context and generation workers
7+
# see it. Putting it only under generation_servers: causes the disagg KV
8+
# transceiver to reject the cacheState handshake (cacheStates differ when
9+
# only one side has spec config).
10+
speculative_config:
11+
decoding_type: Eagle3
12+
max_draft_len: 3
13+
speculative_model: Zhi-Create-Qwen3-32B-Eagle3
14+
context_servers:
15+
num_instances: 4
16+
tensor_parallel_size: 1
17+
pipeline_parallel_size: 1
18+
router:
19+
type: kv_cache_aware
20+
enable_chunked_prefill: true
21+
max_num_tokens: 4096
22+
max_seq_len: 10240
23+
max_batch_size: 128
24+
disable_overlap_scheduler: true
25+
print_iter_log: true
26+
kv_cache_config:
27+
enable_block_reuse: true
28+
enable_partial_reuse: true
29+
dtype: fp8
30+
free_gpu_memory_fraction: 0.8
31+
event_buffer_max_size: 1024
32+
cache_transceiver_config:
33+
backend: DEFAULT
34+
max_tokens_in_buffer: 16384
35+
generation_servers:
36+
num_instances: 1
37+
tensor_parallel_size: 4
38+
pipeline_parallel_size: 1
39+
enable_chunked_prefill: true
40+
max_num_tokens: 4096
41+
max_seq_len: 10240
42+
max_batch_size: 128
43+
# Eagle3 requires the overlap scheduler off. Cache settings must match the
44+
# context block or disagg KV transfer rejects the cacheState handshake.
45+
disable_overlap_scheduler: true
46+
print_iter_log: true
47+
kv_cache_config:
48+
enable_block_reuse: true
49+
enable_partial_reuse: true
50+
dtype: fp8
51+
free_gpu_memory_fraction: 0.8
52+
cache_transceiver_config:
53+
backend: DEFAULT
54+
max_tokens_in_buffer: 16384

tests/integration/defs/disaggregated/test_disaggregated.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,8 @@ def get_test_config(test_desc, example_dir, test_root):
261261
f"{test_configs_root}/disagg_config_ctxtp4_gentp4_deepseek_r1_v2_fp4_tllm.yaml",
262262
"gpt_oss_120b_stress":
263263
f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_tllm.yaml",
264+
"qwen3_32b_fp8_stress":
265+
f"{test_configs_root}/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml",
264266
"gpt_oss_120b_harmony":
265267
f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_tllm.yaml",
266268
"cancel_stress_test":
@@ -294,6 +296,13 @@ def setup_model_symlink(llm_venv, model_root, dest_subpath):
294296
os.symlink(model_root, dst, target_is_directory=True)
295297

296298

299+
def resolve_llm_model_path(model_path: str) -> str:
300+
"""Resolve a model subpath relative to the test LLM model root."""
301+
if os.path.isabs(model_path):
302+
return model_path
303+
return os.path.join(llm_models_root(), model_path)
304+
305+
297306
ClientTestSet = namedtuple('ClientTestSet', [
298307
'completion', 'completion_streaming', 'chat', 'chat_streaming',
299308
'verify_completion', 'verify_streaming_completion', 'verify_chat',
@@ -445,6 +454,11 @@ def run_client_tests(example_dir,
445454
"The capital of Germany is Berlin",
446455
"Using `asyncio` in Python"
447456
]
457+
elif "qwen3_32b_fp8" in test_desc:
458+
expected_strings = [
459+
"The capital of Germany is Berlin",
460+
"Asyncio in Python is a library"
461+
]
448462
else:
449463
expected_strings = [
450464
"The capital of Germany is Berlin",
@@ -494,6 +508,13 @@ def setup_disagg_cluster(
494508
with open(config_file, 'r') as f:
495509
config = yaml.safe_load(f)
496510

511+
speculative_config = config.get("speculative_config")
512+
if isinstance(speculative_config, dict):
513+
speculative_model = speculative_config.get("speculative_model")
514+
if speculative_model:
515+
speculative_config["speculative_model"] = resolve_llm_model_path(
516+
speculative_model)
517+
497518
disagg_cluster = get_default_disagg_cluster_config()
498519
server_host = config.get("hostname", "localhost")
499520
server_port = get_free_port()
@@ -524,6 +545,8 @@ def setup_disagg_cluster(
524545

525546
# Launch workers
526547
model = model_name or config.get("model")
548+
if model:
549+
model = resolve_llm_model_path(model)
527550
ctx_workers = []
528551
gen_workers = []
529552
disagg_server = None
@@ -2087,6 +2110,22 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root,
20872110
cwd=llm_venv.get_working_directory())
20882111

20892112

2113+
@skip_pre_hopper
2114+
@pytest.mark.skip_less_device(8)
2115+
@pytest.mark.parametrize("model_path", ['Qwen3/Qwen3-32B-FP8'])
2116+
def test_disaggregated_qwen3_32b_fp8(disaggregated_test_root,
2117+
disaggregated_example_root, llm_venv,
2118+
model_path):
2119+
model_dir = resolve_llm_model_path(model_path)
2120+
setup_model_symlink(llm_venv, model_dir, model_path)
2121+
2122+
run_disaggregated_test(disaggregated_example_root,
2123+
"qwen3_32b_fp8_stress",
2124+
env=llm_venv._new_env,
2125+
model_path=model_dir,
2126+
cwd=llm_venv.get_working_directory())
2127+
2128+
20902129
@pytest.mark.timeout(12600)
20912130
@pytest.mark.parametrize("test_config", [
20922131
pytest.param(TestConfig(model_path='DeepSeek-R1/DeepSeek-R1-0528-FP4-v2',
@@ -2099,6 +2138,11 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root,
20992138
request_count=60000,
21002139
accuracy_threshold=0.42),
21012140
marks=(pytest.mark.skip_less_device(4), skip_pre_blackwell)),
2141+
pytest.param(TestConfig(model_path='Qwen3/Qwen3-32B-FP8',
2142+
test_desc='qwen3_32b_fp8_stress',
2143+
request_count=10000,
2144+
accuracy_threshold=0.42),
2145+
marks=(pytest.mark.skip_less_device(8), skip_pre_hopper)),
21022146
],
21032147
ids=lambda x: x.test_desc)
21042148
@pytest.mark.parametrize("concurrency", [512], ids=lambda x: f"conc{x}")
@@ -2113,7 +2157,7 @@ def test_disaggregated_stress_test(disaggregated_test_root,
21132157
# Unpack configuration from dataclass
21142158
model_path = test_config.model_path
21152159
test_desc = test_config.test_desc
2116-
model_dir = f"{llm_models_root()}/{model_path}"
2160+
model_dir = resolve_llm_model_path(model_path)
21172161
setup_model_symlink(llm_venv, model_dir, model_path)
21182162

21192163
config_file = get_test_config(test_desc, disaggregated_example_root,

tests/integration/test_lists/qa/llm_function_stress.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress
77
stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
88
disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-deepseek_r1_v2_fp4_stress]
99
disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_stress]
10+
disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_32b_fp8_stress]
1011
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus
1112
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_nvfp4_4gpus
1213
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4_longseq_trtllm_moe_stress

tests/integration/test_lists/test-db/l0_dgx_h200.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ l0_dgx_h200:
4242
- disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
4343
- disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
4444
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxtp2ep2pp2_gentp4_one_mtp_block_reuse[DeepSeek-V3-Lite-fp8]
45+
- disaggregated/test_disaggregated.py::test_disaggregated_qwen3_32b_fp8[Qwen3/Qwen3-32B-FP8]
4546
- unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora
4647
- condition:
4748
ranges:

0 commit comments

Comments
 (0)