[TRTLLM-12154][test] Add Qwen3-32B FP8 disagg stress test

brnguyen2 · brnguyen2 · commit ce5f6a016bc8 · 2026-05-19T06:07:47.000-07:00
Initial wire-up for a Qwen3-32B FP8 disagg stress test on 8x H200 DGX
(4x TP1 prefill + 1x TP4 decode).

New disagg config (disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml)
exercises chunked prefill, KV block reuse across 4 ctx instances
(kv_cache_aware router + event buffer), FP8 KV cache, disagg cache
transfer, and the structured-output backend selection
(guided_decoding_backend: xgrammar).

Two test entries share the same YAML:
- test_disaggregated_qwen3_32b_fp8 (light): exercises the config
  end-to-end via the standard prompts.json client loop. Wired into
  l0_dgx_h200.yml post-merge so each merge to main verifies the config
  still loads and serves. Local pytest run completes in ~5-10 minutes.
- test_disaggregated_stress_test[...-qwen3_32b_fp8_stress]: the long-running
  variant for the QA weekly stress lane (request_count=10000,
  accuracy_threshold=0.30 as conservative initial defaults; expect to
  tighten after the first baseline run). Wired into
  qa/llm_function_stress.txt alongside the existing deepseek/gpt-oss
  stress entries.

Marked skip_pre_hopper on both (vs the existing Blackwell-only entries)
because the target is H200.

Eagle3 is deferred (TODO in YAML): NVIDIA's HF speculative-decoding
collection doesn't currently ship a draft for dense Qwen3-32B, and
Eagle3 is mutually exclusive with enable_block_reuse when KV is FP8
per examples/models/core/qwen/README.md.

Signed-off-by: Brian Nguyen <brnguyen@nvidia.com>
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml
@@ -0,0 +1,45 @@
+# TODO: Enable Eagle3 once a Qwen3-32B draft is available (TRTLLM-12154);
+# also requires turning off enable_block_reuse with FP8 KV.
+hostname: localhost
+model: Qwen3/Qwen3-32B-FP8
+backend: pytorch
+cuda_graph_config: null
+guided_decoding_backend: xgrammar
+context_servers:
+  num_instances: 4
+  tensor_parallel_size: 1
+  pipeline_parallel_size: 1
+  router:
+    type: kv_cache_aware
+  enable_chunked_prefill: true
+  max_num_tokens: 4096
+  max_seq_len: 10240
+  max_batch_size: 128
+  disable_overlap_scheduler: true
+  print_iter_log: true
+  kv_cache_config:
+    enable_block_reuse: true
+    enable_partial_reuse: true
+    dtype: fp8
+    free_gpu_memory_fraction: 0.8
+    event_buffer_max_size: 1024
+  cache_transceiver_config:
+    backend: DEFAULT
+    max_tokens_in_buffer: 16384
+generation_servers:
+  num_instances: 1
+  tensor_parallel_size: 4
+  pipeline_parallel_size: 1
+  enable_chunked_prefill: true
+  max_num_tokens: 4096
+  max_seq_len: 10240
+  max_batch_size: 128
+  print_iter_log: true
+  kv_cache_config:
+    enable_block_reuse: true
+    enable_partial_reuse: true
+    dtype: fp8
+    free_gpu_memory_fraction: 0.8
+  cache_transceiver_config:
+    backend: DEFAULT
+    max_tokens_in_buffer: 16384
diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -261,6 +261,8 @@ def get_test_config(test_desc, example_dir, test_root):
         f"{test_configs_root}/disagg_config_ctxtp4_gentp4_deepseek_r1_v2_fp4_tllm.yaml",
         "gpt_oss_120b_stress":
         f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_tllm.yaml",
+        "qwen3_32b_fp8_stress":
+        f"{test_configs_root}/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml",
         "gpt_oss_120b_harmony":
         f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_tllm.yaml",
         "cancel_stress_test":
@@ -2087,6 +2089,22 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root,
                            cwd=llm_venv.get_working_directory())
 
 
+@skip_pre_hopper
+@pytest.mark.skip_less_device(8)
+@pytest.mark.parametrize("model_path", ['Qwen3/Qwen3-32B-FP8'])
+def test_disaggregated_qwen3_32b_fp8(disaggregated_test_root,
+                                     disaggregated_example_root, llm_venv,
+                                     model_path):
+    model_dir = f"{llm_models_root()}/{model_path}"
+    setup_model_symlink(llm_venv, model_dir, model_path)
+
+    run_disaggregated_test(disaggregated_example_root,
+                           "qwen3_32b_fp8_stress",
+                           env=llm_venv._new_env,
+                           model_path=model_dir,
+                           cwd=llm_venv.get_working_directory())
+
+
 @pytest.mark.timeout(12600)
 @pytest.mark.parametrize("test_config", [
     pytest.param(TestConfig(model_path='DeepSeek-R1/DeepSeek-R1-0528-FP4-v2',
@@ -2099,6 +2117,11 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root,
                             request_count=60000,
                             accuracy_threshold=0.42),
                  marks=(pytest.mark.skip_less_device(4), skip_pre_blackwell)),
+    pytest.param(TestConfig(model_path='Qwen3/Qwen3-32B-FP8',
+                            test_desc='qwen3_32b_fp8_stress',
+                            request_count=10000,
+                            accuracy_threshold=0.30),
+                 marks=(pytest.mark.skip_less_device(8), skip_pre_hopper)),
 ],
                          ids=lambda x: x.test_desc)
 @pytest.mark.parametrize("concurrency", [512], ids=lambda x: f"conc{x}")
diff --git a/tests/integration/test_lists/qa/llm_function_stress.txt b/tests/integration/test_lists/qa/llm_function_stress.txt
@@ -7,6 +7,7 @@ stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress
 stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1-0528-FP4_tp4-stress_time_3600s_timeout_10800s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-deepseek_r1_v2_fp4_stress]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_stress]
+disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_32b_fp8_stress]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_nvfp4_4gpus
 accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4_longseq_trtllm_moe_stress
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -42,6 +42,7 @@ l0_dgx_h200:
   - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxtp2ep2pp2_gentp4_one_mtp_block_reuse[DeepSeek-V3-Lite-fp8]
+  - disaggregated/test_disaggregated.py::test_disaggregated_qwen3_32b_fp8[Qwen3/Qwen3-32B-FP8]
   - unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora
 - condition:
     ranges: