NVIDIA · xinhe-nv · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
@@ -0,0 +1,59 @@
+model: GLM-5-NVFP4  # top-level keys (model, hostname, backend) sit outside both server sections below
+hostname: localhost
+backend: pytorch
+context_servers:  # first of two server sections; mirrors generation_servers except where noted
+  num_instances: 1
+  tensor_parallel_size: 4
+  pipeline_parallel_size: 1
+  moe_expert_parallel_size: 4  # same value as tensor_parallel_size
+  enable_attention_dp: true
+  max_num_tokens: 16640  # differs from generation_servers (10240)
+  max_seq_len: 8232  # differs from generation_servers (10240)
+  max_batch_size: 128
+  enable_chunked_prefill: true
+  kv_cache_config:
+    enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    dtype: fp8
+  moe_config:
+    backend: TRTLLM
+  cuda_graph_config: null  # explicitly null here; generation_servers supplies a populated cuda_graph_config
+  print_iter_log: true
+  cache_transceiver_config:
+    backend: DEFAULT
+    max_tokens_in_buffer: 16384  # same value in generation_servers
+generation_servers:  # second server section; same parallelism values as context_servers
+  num_instances: 1
+  tensor_parallel_size: 4
+  pipeline_parallel_size: 1
+  moe_expert_parallel_size: 4  # same value as tensor_parallel_size
+  enable_attention_dp: true
+  max_num_tokens: 10240  # equal to max_seq_len in this section
+  max_seq_len: 10240
+  max_batch_size: 128
+  enable_chunked_prefill: true
+  kv_cache_config:
+    enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    dtype: fp8
+  moe_config:
+    backend: TRTLLM
+  cuda_graph_config:  # populated only in this section (context_servers sets it to null)
+    enable_padding: true
+    batch_sizes:  # explicit list, 1 through 1024
+    - 1
+    - 2
+    - 4
+    - 8
+    - 16
+    - 32
+    - 64
+    - 128  # equals max_batch_size of this section
+    - 256  # NOTE(review): this and the larger entries exceed max_batch_size (128) above — confirm they are intended
+    - 512
+    - 768
+    - 1024
+  print_iter_log: true
+  cache_transceiver_config:
+    backend: DEFAULT
+    max_tokens_in_buffer: 16384  # same value in context_servers
@@ -309,6 +309,8 @@ def get_test_config(test_desc, example_dir, test_root):
         f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_triton.yaml",
         "qwen3_5_4b_fp8_stress":
         f"{test_configs_root}/disagg_config_ctxtp1_gentp1_qwen3_5_4b_fp8_tllm.yaml",
+        "glm5_nvfp4_tp4_ep4_dp_stress":
+        f"{test_configs_root}/disagg_config_ctxtp4ep4_gentp4ep4_glm5_nvfp4_dp_tllm.yaml",
         "qwen3_32b_fp8_stress":
         f"{test_configs_root}/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml",
         "gpt_oss_120b_harmony":
@@ -2382,6 +2384,13 @@ def test_disaggregated_qwen3_32b_fp8(disaggregated_test_root,
                             cancellation_rate=10,
                             cancellation_delay=0.5),
                  marks=(pytest.mark.skip_less_device(2), skip_no_hopper)),
+    pytest.param(TestConfig(model_path='GLM-5-NVFP4',
+                            test_desc='glm5_nvfp4_tp4_ep4_dp_stress',
+                            request_count=35000,
+                            accuracy_threshold=0.90,
+                            cancellation_rate=10,
+                            cancellation_delay=0.5),
+                 marks=(pytest.mark.skip_less_device(8), skip_pre_blackwell)),
     pytest.param(TestConfig(model_path='Qwen3/Qwen3-32B-FP8',
                             test_desc='qwen3_32b_fp8_stress',
                             request_count=10000,

@@ -5,6 +5,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-outp
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_eagle_trtllm_stress]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_triton_stress]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_5_4b_fp8_stress]
+disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-glm5_nvfp4_tp4_ep4_dp_stress]
 disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_32b_fp8_stress]
 stress_test/disagg_cancel/test_disagg_cancel_stress.py::test_disagg_cancellation_marathon[marathon_cpp_v1_deepseek.yaml] TIMEOUT (45)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus