[TRTLLM-12154][test] Add Qwen3-32B FP8 disagg stress test (#14278)

brnguyen2 · web-flow · commit ccc0708ef6e9 · 2026-06-12T07:34:47.000+08:00
Signed-off-by: Brian Nguyen <brnguyen@nvidia.com>
diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py
@@ -56,6 +56,15 @@ def _war_check_output(*args, **kwargs):
     return venv.run_cmd(cmd, caller=_war_check_output, env=env, **kwargs)
 
 
+def resolve_llm_model_path(model_path: str) -> str:
+    """Resolve a model path: absolute as-is, else under llm_models_root()."""
+    if os.path.isabs(model_path):
+        return model_path
+
+    from .conftest import llm_models_root
+    return os.path.join(llm_models_root(), model_path)
+
+
 def venv_mpi_check_call(venv, mpi_cmd, python_cmd, **kwargs):
     """
     This function WAR check_call() to run python_cmd with mpi.
@@ -1287,9 +1296,11 @@ def parse_gsm8k_output(output_text: str) -> float:
         float: The accuracy value (0.7582 in the example)
     """
 
-    # Look for the specific pattern: |gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.7559|±  |0.0118|
+    # Look for the specific pattern:
+    # |gsm8k|...|flexible-extract|     5|exact_match|↑  |0.7559|±  |0.0118|
+    # lm-eval pads table cells, so allow whitespace around the value.
     patterns = [
-        r'flexible-extract\|\s+\d+\|exact_match\|\↑\s+\|(\d+\.\d+)',
+        r'flexible-extract\s*\|\s*\d+\s*\|\s*exact_match\s*\|\s*↑\s*\|\s*(\d+(?:\.\d+)?)',
     ]
 
     for pattern in patterns:
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml
@@ -0,0 +1,54 @@
+hostname: localhost
+model: Qwen3/Qwen3-32B-FP8
+backend: pytorch
+cuda_graph_config: null
+guided_decoding_backend: xgrammar
+# speculative_config goes at the top level so that both context and generation
+# workers see it. Putting it only under generation_servers: causes the disagg
+# KV transceiver to reject the cacheState handshake (the cacheStates differ
+# when only one side has a spec config).
+speculative_config:
+  decoding_type: Eagle3
+  max_draft_len: 3
+  speculative_model: Zhi-Create-Qwen3-32B-Eagle3
+context_servers:
+  num_instances: 4
+  tensor_parallel_size: 1
+  pipeline_parallel_size: 1
+  router:
+    type: kv_cache_aware
+  enable_chunked_prefill: true
+  max_num_tokens: 4096
+  max_seq_len: 10240
+  max_batch_size: 128
+  disable_overlap_scheduler: true
+  print_iter_log: true
+  kv_cache_config:
+    enable_block_reuse: true
+    enable_partial_reuse: true
+    dtype: fp8
+    free_gpu_memory_fraction: 0.8
+    event_buffer_max_size: 1024
+  cache_transceiver_config:
+    backend: DEFAULT
+    max_tokens_in_buffer: 16384
+generation_servers:
+  num_instances: 1
+  tensor_parallel_size: 4
+  pipeline_parallel_size: 1
+  enable_chunked_prefill: true
+  max_num_tokens: 4096
+  max_seq_len: 10240
+  max_batch_size: 128
+  # Eagle3 requires the overlap scheduler off. Cache settings must match the
+  # context block or disagg KV transfer rejects the cacheState handshake.
+  disable_overlap_scheduler: true
+  print_iter_log: true
+  kv_cache_config:
+    enable_block_reuse: true
+    enable_partial_reuse: true
+    dtype: fp8
+    free_gpu_memory_fraction: 0.8
+  cache_transceiver_config:
+    backend: DEFAULT
+    max_tokens_in_buffer: 16384
diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -31,7 +31,8 @@
 import pytest
 import yaml
 from defs.common import get_free_port_in_ci as get_free_port
-from defs.common import parse_gsm8k_output, wait_for_server
+from defs.common import (parse_gsm8k_output, resolve_llm_model_path,
+                         wait_for_server)
 from defs.conftest import (get_sm_version, llm_models_root, skip_arm,
                            skip_no_hopper, skip_pre_blackwell, skip_pre_hopper)
 from defs.trt_test_alternative import check_call, check_output, print_info
@@ -308,6 +309,8 @@ def get_test_config(test_desc, example_dir, test_root):
         f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_triton.yaml",
         "qwen3_5_4b_fp8_stress":
         f"{test_configs_root}/disagg_config_ctxtp1_gentp1_qwen3_5_4b_fp8_tllm.yaml",
+        "qwen3_32b_fp8_stress":
+        f"{test_configs_root}/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml",
         "gpt_oss_120b_harmony":
         f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_tllm.yaml",
         "cancel_stress_test":
@@ -492,6 +495,11 @@ def run_client_tests(example_dir,
                         "The capital of Germany is Berlin",
                         "Using `asyncio` in Python"
                     ]
+                elif "qwen3_32b_fp8" in test_desc:
+                    expected_strings = [
+                        "The capital of Germany is Berlin",
+                        "Asyncio in Python is a library"
+                    ]
                 else:
                     expected_strings = [
                         "The capital of Germany is Berlin",
@@ -618,6 +626,13 @@ def setup_disagg_cluster(
     with open(config_file, 'r') as f:
         config = yaml.safe_load(f)
 
+    speculative_config = config.get("speculative_config")
+    if isinstance(speculative_config, dict):
+        speculative_model = speculative_config.get("speculative_model")
+        if speculative_model:
+            speculative_config["speculative_model"] = resolve_llm_model_path(
+                speculative_model)
+
     disagg_cluster = get_default_disagg_cluster_config()
     server_host = config.get("hostname", "localhost")
     server_port = get_free_port()
@@ -648,6 +663,8 @@ def setup_disagg_cluster(
 
     # Launch workers
     model = model_name or config.get("model")
+    if model:
+        model = resolve_llm_model_path(model)
     ctx_workers = []
     gen_workers = []
     disagg_server = None
@@ -2288,6 +2305,22 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root,
                            cwd=llm_venv.get_working_directory())
 
 
+@skip_pre_hopper
+@pytest.mark.skip_less_device(8)  # config: 4x ctx (TP1) + 1x gen (TP4)
+@pytest.mark.parametrize("model_path", ['Qwen3/Qwen3-32B-FP8'])
+def test_disaggregated_qwen3_32b_fp8(disaggregated_test_root,
+                                     disaggregated_example_root, llm_venv,
+                                     model_path):
+    model_dir = resolve_llm_model_path(model_path)
+    setup_model_symlink(llm_venv, model_dir, model_path)
+
+    run_disaggregated_test(disaggregated_example_root,
+                           "qwen3_32b_fp8_stress",  # get_test_config key
+                           env=llm_venv._new_env,
+                           model_path=model_dir,
+                           cwd=llm_venv.get_working_directory())
+
+
 @pytest.mark.timeout(12600)
 @pytest.mark.parametrize("test_config", [
     pytest.param(TestConfig(model_path='DeepSeek-R1/DeepSeek-R1-0528-FP4-v2',
@@ -2349,6 +2382,11 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root,
                             cancellation_rate=10,
                             cancellation_delay=0.5),
                  marks=(pytest.mark.skip_less_device(2), skip_no_hopper)),
+    pytest.param(TestConfig(model_path='Qwen3/Qwen3-32B-FP8',
+                            test_desc='qwen3_32b_fp8_stress',
+                            request_count=10000,
+                            accuracy_threshold=0.42),
+                 marks=(pytest.mark.skip_less_device(8), skip_pre_hopper)),
 ],
                          ids=lambda x: x.test_desc)
 @pytest.mark.parametrize("concurrency", [512], ids=lambda x: f"conc{x}")
@@ -2363,7 +2401,7 @@ def test_disaggregated_stress_test(disaggregated_test_root,
     # Unpack configuration from dataclass
     model_path = test_config.model_path
     test_desc = test_config.test_desc
-    model_dir = f"{llm_models_root()}/{model_path}"
+    model_dir = resolve_llm_model_path(model_path)
     setup_model_symlink(llm_venv, model_dir, model_path)
 
     config_file = get_test_config(test_desc, disaggregated_example_root,
diff --git a/tests/integration/test_lists/qa/llm_function_stress.txt b/tests/integration/test_lists/qa/llm_function_stress.txt
@@ -5,6 +5,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-outp
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_eagle_trtllm_stress]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_triton_stress]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_5_4b_fp8_stress]
+disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_32b_fp8_stress]
 stress_test/disagg_cancel/test_disagg_cancel_stress.py::test_disagg_cancellation_marathon[marathon_cpp_v1_deepseek.yaml] TIMEOUT (45)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_nvfp4_4gpus
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -40,6 +40,7 @@ l0_dgx_h200:
   - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxtp2ep2pp2_gentp4_one_mtp_block_reuse[DeepSeek-V3-Lite-fp8]
+  - disaggregated/test_disaggregated.py::test_disaggregated_qwen3_32b_fp8[Qwen3/Qwen3-32B-FP8]
   - unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora
   - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_gen_only_spec_dec
 - condition:
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -86,6 +86,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=False] SKIP (https://nvbugs/6260915)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=True] SKIP (https://nvbugs/6248783)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.9-fp8kv=False] SKIP (https://nvbugs/6260915)
+accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16_mtp[mtp_on] SKIP (https://nvbugs/6206179)
 accuracy/test_llm_api_pytorch.py::TestQwen3_5_397B_A17B::test_nvfp4[tep4_cutedsl] SKIP (https://nvbugs/6255417)
 accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_bf16 SKIP (https://nvbugs/6283537)
 accuracy/test_llm_api_pytorch.py::TestQwen3_5_9B::test_bf16[mtp_off] SKIP (https://nvbugs/6212250)