From 21556fa4729e6d6073d44a0d0fc5831695df40ea Mon Sep 17 00:00:00 2001 From: Brian Nguyen Date: Wed, 20 May 2026 11:20:49 -0700 Subject: [PATCH 1/4] [TRTLLM-12154][test] Add Qwen3-32B FP8 Eagle3 disagg stress test Add a Qwen3-32B FP8 disaggregated serving smoke and stress test that exercises Eagle3 with 4x TP1 context workers and 1x TP4 generation worker on 8 GPUs. The YAML enables FP8 KV cache, chunked prefill, block and partial reuse, cache transfer, and a top-level Eagle3 speculative_config shared by context and generation workers. The draft model is stored as a model-root-relative path and the disagg harness now resolves relative model and speculative_model values through llm_models_root while preserving absolute paths. Wire the smoke test into the H200 L0 list and the full 10k-request stress case into the QA stress list. Add Qwen-specific output substring checks and keep the stress accuracy threshold aligned with the adjacent GPT-OSS stress case. Signed-off-by: Brian Nguyen --- tests/integration/defs/common.py | 6 ++- ...gg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml | 54 +++++++++++++++++++ .../defs/disaggregated/test_disaggregated.py | 46 +++++++++++++++- .../test_lists/qa/llm_function_stress.txt | 1 + .../test_lists/test-db/l0_dgx_h200.yml | 1 + 5 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py index e56772973943..2bf5cbd89bd1 100644 --- a/tests/integration/defs/common.py +++ b/tests/integration/defs/common.py @@ -1287,9 +1287,11 @@ def parse_gsm8k_output(output_text: str) -> float: float: The accuracy value (0.7582 in the example) """ - # Look for the specific pattern: |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7559|± |0.0118| + # Look for the specific pattern: + # |gsm8k|...|flexible-extract| 5|exact_match|↑ |0.7559|± |0.0118| + # lm-eval pads table cells, so allow whitespace around the value. patterns = [ - r'flexible-extract\|\s+\d+\|exact_match\|\↑\s+\|(\d+\.\d+)', + r'flexible-extract\s*\|\s*\d+\s*\|\s*exact_match\s*\|\s*↑\s*\|\s*(\d+(?:\.\d+)?)', ] for pattern in patterns: diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml new file mode 100644 index 000000000000..54e5b2bd8942 --- /dev/null +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml @@ -0,0 +1,54 @@ +hostname: localhost +model: Qwen3/Qwen3-32B-FP8 +backend: pytorch +cuda_graph_config: null +guided_decoding_backend: xgrammar +# speculative_config goes top-level so both context and generation workers +# see it. Putting it under generation_servers: only causes the disagg KV +# transceiver to reject the cacheState handshake (cacheStates differ when +# only one side has spec config). +speculative_config: + decoding_type: Eagle3 + max_draft_len: 3 + speculative_model: Zhi-Create-Qwen3-32B-Eagle3 +context_servers: + num_instances: 4 + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + router: + type: kv_cache_aware + enable_chunked_prefill: true + max_num_tokens: 4096 + max_seq_len: 10240 + max_batch_size: 128 + disable_overlap_scheduler: true + print_iter_log: true + kv_cache_config: + enable_block_reuse: true + enable_partial_reuse: true + dtype: fp8 + free_gpu_memory_fraction: 0.8 + event_buffer_max_size: 1024 + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 16384 +generation_servers: + num_instances: 1 + tensor_parallel_size: 4 + pipeline_parallel_size: 1 + enable_chunked_prefill: true + max_num_tokens: 4096 + max_seq_len: 10240 + max_batch_size: 128 + # Eagle3 requires the overlap scheduler off. Cache settings must match the + # context block or disagg KV transfer rejects the cacheState handshake. + disable_overlap_scheduler: true + print_iter_log: true + kv_cache_config: + enable_block_reuse: true + enable_partial_reuse: true + dtype: fp8 + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 16384 diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index a5beb9c2de26..18a25e76836c 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -308,6 +308,8 @@ def get_test_config(test_desc, example_dir, test_root): f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_triton.yaml", "qwen3_5_4b_fp8_stress": f"{test_configs_root}/disagg_config_ctxtp1_gentp1_qwen3_5_4b_fp8_tllm.yaml", + "qwen3_32b_fp8_stress": + f"{test_configs_root}/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml", "gpt_oss_120b_harmony": f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_tllm.yaml", "cancel_stress_test": @@ -341,6 +343,13 @@ def setup_model_symlink(llm_venv, model_root, dest_subpath): os.symlink(model_root, dst, target_is_directory=True) +def resolve_llm_model_path(model_path: str) -> str: + """Resolve a model subpath relative to the test LLM model root.""" + if os.path.isabs(model_path): + return model_path + return os.path.join(llm_models_root(), model_path) + + ClientTestSet = namedtuple('ClientTestSet', [ 'completion', 'completion_streaming', 'chat', 'chat_streaming', 'verify_completion', 'verify_streaming_completion', 'verify_chat', @@ -492,6 +501,11 @@ def run_client_tests(example_dir, "The capital of Germany is Berlin", "Using `asyncio` in Python" ] + elif "qwen3_32b_fp8" in test_desc: + expected_strings = [ + "The capital of Germany is Berlin", + "Asyncio in Python is a library" + ] else: expected_strings = [ "The capital of Germany is Berlin", @@ -618,6 +632,13 @@ def setup_disagg_cluster( with open(config_file, 'r') as f: config = yaml.safe_load(f) + speculative_config = config.get("speculative_config") + if isinstance(speculative_config, dict): + speculative_model = speculative_config.get("speculative_model") + if speculative_model: + speculative_config["speculative_model"] = resolve_llm_model_path( + speculative_model) + disagg_cluster = get_default_disagg_cluster_config() server_host = config.get("hostname", "localhost") server_port = get_free_port() @@ -648,6 +669,8 @@ def setup_disagg_cluster( # Launch workers model = model_name or config.get("model") + if model: + model = resolve_llm_model_path(model) ctx_workers = [] gen_workers = [] disagg_server = None @@ -2288,6 +2311,22 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root, cwd=llm_venv.get_working_directory()) +@skip_pre_hopper +@pytest.mark.skip_less_device(8) +@pytest.mark.parametrize("model_path", ['Qwen3/Qwen3-32B-FP8']) +def test_disaggregated_qwen3_32b_fp8(disaggregated_test_root, + disaggregated_example_root, llm_venv, + model_path): + model_dir = resolve_llm_model_path(model_path) + setup_model_symlink(llm_venv, model_dir, model_path) + + run_disaggregated_test(disaggregated_example_root, + "qwen3_32b_fp8_stress", + env=llm_venv._new_env, + model_path=model_dir, + cwd=llm_venv.get_working_directory()) + + @pytest.mark.timeout(12600) @pytest.mark.parametrize("test_config", [ pytest.param(TestConfig(model_path='DeepSeek-R1/DeepSeek-R1-0528-FP4-v2', @@ -2349,6 +2388,11 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root, cancellation_rate=10, cancellation_delay=0.5), marks=(pytest.mark.skip_less_device(2), skip_no_hopper)), + pytest.param(TestConfig(model_path='Qwen3/Qwen3-32B-FP8', + test_desc='qwen3_32b_fp8_stress', + request_count=10000, + accuracy_threshold=0.42), + marks=(pytest.mark.skip_less_device(8), skip_pre_hopper)), ], ids=lambda x: x.test_desc) @pytest.mark.parametrize("concurrency", [512], ids=lambda x: f"conc{x}") @@ -2363,7 +2407,7 @@ def test_disaggregated_stress_test(disaggregated_test_root, # Unpack configuration from dataclass model_path = test_config.model_path test_desc = test_config.test_desc - model_dir = f"{llm_models_root()}/{model_path}" + model_dir = resolve_llm_model_path(model_path) setup_model_symlink(llm_venv, model_dir, model_path) config_file = get_test_config(test_desc, disaggregated_example_root, diff --git a/tests/integration/test_lists/qa/llm_function_stress.txt b/tests/integration/test_lists/qa/llm_function_stress.txt index 9d31b6149c46..677cafe85289 100644 --- a/tests/integration/test_lists/qa/llm_function_stress.txt +++ b/tests/integration/test_lists/qa/llm_function_stress.txt @@ -5,6 +5,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-outp disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_eagle_trtllm_stress] disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_triton_stress] disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_5_4b_fp8_stress] +disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_32b_fp8_stress] stress_test/disagg_cancel/test_disagg_cancel_stress.py::test_disagg_cancellation_marathon[marathon_cpp_v1_deepseek.yaml] TIMEOUT (45) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_nvfp4_4gpus diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml index aa468157ea3a..86b3512b69d2 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml @@ -40,6 +40,7 @@ l0_dgx_h200: - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxtp2ep2pp2_gentp4_one_mtp_block_reuse[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated.py::test_disaggregated_qwen3_32b_fp8[Qwen3/Qwen3-32B-FP8] - unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_gen_only_spec_dec - condition: From 392fd4bbc906663251ff3929907d1294c025e211 Mon Sep 17 00:00:00 2001 From: Brian Nguyen Date: Wed, 20 May 2026 19:44:58 -0500 Subject: [PATCH 2/4] [TRTLLM-12154][test] Address disagg helper naming nit Signed-off-by: Brian Nguyen --- .../defs/disaggregated/test_disaggregated.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index 18a25e76836c..26590f3f61bb 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -343,7 +343,7 @@ def setup_model_symlink(llm_venv, model_root, dest_subpath): os.symlink(model_root, dst, target_is_directory=True) -def resolve_llm_model_path(model_path: str) -> str: +def _resolve_llm_model_path(model_path: str) -> str: """Resolve a model subpath relative to the test LLM model root.""" if os.path.isabs(model_path): return model_path @@ -636,7 +636,7 @@ def setup_disagg_cluster( if isinstance(speculative_config, dict): speculative_model = speculative_config.get("speculative_model") if speculative_model: - speculative_config["speculative_model"] = resolve_llm_model_path( + speculative_config["speculative_model"] = _resolve_llm_model_path( speculative_model) disagg_cluster = get_default_disagg_cluster_config() @@ -670,7 +670,7 @@ def setup_disagg_cluster( # Launch workers model = model_name or config.get("model") if model: - model = resolve_llm_model_path(model) + model = _resolve_llm_model_path(model) ctx_workers = [] gen_workers = [] disagg_server = None @@ -2317,7 +2317,7 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root, def test_disaggregated_qwen3_32b_fp8(disaggregated_test_root, disaggregated_example_root, llm_venv, model_path): - model_dir = resolve_llm_model_path(model_path) + model_dir = _resolve_llm_model_path(model_path) setup_model_symlink(llm_venv, model_dir, model_path) run_disaggregated_test(disaggregated_example_root, @@ -2407,7 +2407,7 @@ def test_disaggregated_stress_test(disaggregated_test_root, # Unpack configuration from dataclass model_path = test_config.model_path test_desc = test_config.test_desc - model_dir = resolve_llm_model_path(model_path) + model_dir = _resolve_llm_model_path(model_path) setup_model_symlink(llm_venv, model_dir, model_path) config_file = get_test_config(test_desc, disaggregated_example_root, From 0c223457bdfbba12b96b4affb8e974b8d530d7e0 Mon Sep 17 00:00:00 2001 From: Brian Nguyen Date: Thu, 21 May 2026 14:29:12 -0500 Subject: [PATCH 3/4] [TRTLLM-12154][test] Move model path helper to common Signed-off-by: Brian Nguyen --- tests/integration/defs/common.py | 9 +++++++++ .../defs/disaggregated/test_disaggregated.py | 18 ++++++------------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py index 2bf5cbd89bd1..ac4ef8f2b7b7 100644 --- a/tests/integration/defs/common.py +++ b/tests/integration/defs/common.py @@ -56,6 +56,15 @@ def _war_check_output(*args, **kwargs): return venv.run_cmd(cmd, caller=_war_check_output, env=env, **kwargs) +def resolve_llm_model_path(model_path: str) -> str: + """Resolve a model subpath relative to the test LLM model root.""" + if os.path.isabs(model_path): + return model_path + + from .conftest import llm_models_root + return os.path.join(llm_models_root(), model_path) + + def venv_mpi_check_call(venv, mpi_cmd, python_cmd, **kwargs): """ This function WAR check_call() to run python_cmd with mpi. diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index 26590f3f61bb..96f9c9ef4115 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -31,7 +31,8 @@ import pytest import yaml from defs.common import get_free_port_in_ci as get_free_port -from defs.common import parse_gsm8k_output, wait_for_server +from defs.common import (parse_gsm8k_output, resolve_llm_model_path, + wait_for_server) from defs.conftest import (get_sm_version, llm_models_root, skip_arm, skip_no_hopper, skip_pre_blackwell, skip_pre_hopper) from defs.trt_test_alternative import check_call, check_output, print_info @@ -343,13 +344,6 @@ def setup_model_symlink(llm_venv, model_root, dest_subpath): os.symlink(model_root, dst, target_is_directory=True) -def _resolve_llm_model_path(model_path: str) -> str: - """Resolve a model subpath relative to the test LLM model root.""" - if os.path.isabs(model_path): - return model_path - return os.path.join(llm_models_root(), model_path) - - ClientTestSet = namedtuple('ClientTestSet', [ 'completion', 'completion_streaming', 'chat', 'chat_streaming', 'verify_completion', 'verify_streaming_completion', 'verify_chat', @@ -636,7 +630,7 @@ def setup_disagg_cluster( if isinstance(speculative_config, dict): speculative_model = speculative_config.get("speculative_model") if speculative_model: - speculative_config["speculative_model"] = _resolve_llm_model_path( + speculative_config["speculative_model"] = resolve_llm_model_path( speculative_model) disagg_cluster = get_default_disagg_cluster_config() @@ -670,7 +664,7 @@ def setup_disagg_cluster( # Launch workers model = model_name or config.get("model") if model: - model = _resolve_llm_model_path(model) + model = resolve_llm_model_path(model) ctx_workers = [] gen_workers = [] disagg_server = None @@ -2317,7 +2311,7 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root, def test_disaggregated_qwen3_32b_fp8(disaggregated_test_root, disaggregated_example_root, llm_venv, model_path): - model_dir = _resolve_llm_model_path(model_path) + model_dir = resolve_llm_model_path(model_path) setup_model_symlink(llm_venv, model_dir, model_path) run_disaggregated_test(disaggregated_example_root, @@ -2407,7 +2401,7 @@ def test_disaggregated_stress_test(disaggregated_test_root, # Unpack configuration from dataclass model_path = test_config.model_path test_desc = test_config.test_desc - model_dir = _resolve_llm_model_path(model_path) + model_dir = resolve_llm_model_path(model_path) setup_model_symlink(llm_venv, model_dir, model_path) config_file = get_test_config(test_desc, disaggregated_example_root, From 0bedcb2528dc431aa1ff26f4fdde3b218a1f1206 Mon Sep 17 00:00:00 2001 From: Brian Nguyen Date: Fri, 22 May 2026 06:39:47 -0500 Subject: [PATCH 4/4] [https://nvbugs/6206179][test] Waive failing TestQwen3_5_35B_A3B::test_bf16_mtp[mtp_on] The test crashes during autotuner warmup with 'NoneType' object has no attribute 'gather_ids' at modeling_speculative.py:1748 when MTP eagle one-model is combined with Qwen3.5-35B-A3B. Pre-existing regression on main introduced by the Qwen3.5 VL MoE landing (96a4a0937e); unrelated to this PR's changes. Tracked in https://nvbugs/6206179. Signed-off-by: Brian Nguyen --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 452eb5ce73d7..0697d2256884 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -86,6 +86,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=False] SKIP (https://nvbugs/6260915) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=True] SKIP (https://nvbugs/6248783) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.9-fp8kv=False] SKIP (https://nvbugs/6260915) +accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16_mtp[mtp_on] SKIP (https://nvbugs/6206179) accuracy/test_llm_api_pytorch.py::TestQwen3_5_397B_A17B::test_nvfp4[tep4_cutedsl] SKIP (https://nvbugs/6255417) accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_bf16 SKIP (https://nvbugs/6283537) accuracy/test_llm_api_pytorch.py::TestQwen3_5_9B::test_bf16[mtp_off] SKIP (https://nvbugs/6212250)