Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions tests/integration/defs/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,15 @@ def _war_check_output(*args, **kwargs):
return venv.run_cmd(cmd, caller=_war_check_output, env=env, **kwargs)


def resolve_llm_model_path(model_path: str) -> str:
    """Return a usable filesystem path for a test model.

    Args:
        model_path: Either an absolute path, which is returned unchanged,
            or a subpath that is joined onto the value returned by
            ``llm_models_root()``.

    Returns:
        The absolute input as-is, or ``llm_models_root()`` joined with
        ``model_path``.
    """
    if not os.path.isabs(model_path):
        # Imported lazily, only when a relative path actually needs the root
        # (presumably to avoid a circular import with conftest -- TODO confirm).
        from .conftest import llm_models_root
        model_path = os.path.join(llm_models_root(), model_path)
    return model_path


def venv_mpi_check_call(venv, mpi_cmd, python_cmd, **kwargs):
"""
This function WAR check_call() to run python_cmd with mpi.
Expand Down Expand Up @@ -1287,9 +1296,11 @@ def parse_gsm8k_output(output_text: str) -> float:
float: The accuracy value (0.7582 in the example)
"""

# Look for the specific pattern: |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7559|± |0.0118|
# Look for the specific pattern:
# |gsm8k|...|flexible-extract| 5|exact_match|↑ |0.7559|± |0.0118|
# lm-eval pads table cells, so allow whitespace around the value.
patterns = [
r'flexible-extract\|\s+\d+\|exact_match\|\↑\s+\|(\d+\.\d+)',
r'flexible-extract\s*\|\s*\d+\s*\|\s*exact_match\s*\|\s*↑\s*\|\s*(\d+(?:\.\d+)?)',
]

for pattern in patterns:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
hostname: localhost
model: Qwen3/Qwen3-32B-FP8
backend: pytorch
cuda_graph_config: null
guided_decoding_backend: xgrammar
# speculative_config is set at the top level so that both the context and
# generation workers see it. Placing it only under generation_servers makes
# the disagg KV transceiver reject the cacheState handshake, because the
# cacheStates differ when only one side has a spec config.
speculative_config:
decoding_type: Eagle3
max_draft_len: 3
speculative_model: Zhi-Create-Qwen3-32B-Eagle3
context_servers:
num_instances: 4
tensor_parallel_size: 1
pipeline_parallel_size: 1
router:
type: kv_cache_aware
enable_chunked_prefill: true
max_num_tokens: 4096
max_seq_len: 10240
max_batch_size: 128
disable_overlap_scheduler: true
print_iter_log: true
kv_cache_config:
enable_block_reuse: true
enable_partial_reuse: true
dtype: fp8
free_gpu_memory_fraction: 0.8
event_buffer_max_size: 1024
cache_transceiver_config:
backend: DEFAULT
max_tokens_in_buffer: 16384
generation_servers:
num_instances: 1
tensor_parallel_size: 4
pipeline_parallel_size: 1
enable_chunked_prefill: true
max_num_tokens: 4096
max_seq_len: 10240
max_batch_size: 128
# Eagle3 requires the overlap scheduler off. Cache settings must match the
# context block or disagg KV transfer rejects the cacheState handshake.
disable_overlap_scheduler: true
print_iter_log: true
kv_cache_config:
enable_block_reuse: true
enable_partial_reuse: true
dtype: fp8
free_gpu_memory_fraction: 0.8
cache_transceiver_config:
backend: DEFAULT
max_tokens_in_buffer: 16384
42 changes: 40 additions & 2 deletions tests/integration/defs/disaggregated/test_disaggregated.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
import pytest
import yaml
from defs.common import get_free_port_in_ci as get_free_port
from defs.common import parse_gsm8k_output, wait_for_server
from defs.common import (parse_gsm8k_output, resolve_llm_model_path,
wait_for_server)
from defs.conftest import (get_sm_version, llm_models_root, skip_arm,
skip_no_hopper, skip_pre_blackwell, skip_pre_hopper)
from defs.trt_test_alternative import check_call, check_output, print_info
Expand Down Expand Up @@ -308,6 +309,8 @@ def get_test_config(test_desc, example_dir, test_root):
f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_triton.yaml",
"qwen3_5_4b_fp8_stress":
f"{test_configs_root}/disagg_config_ctxtp1_gentp1_qwen3_5_4b_fp8_tllm.yaml",
"qwen3_32b_fp8_stress":
f"{test_configs_root}/disagg_config_ctxtp1_gentp4_qwen3_32b_fp8.yaml",
"gpt_oss_120b_harmony":
f"{test_configs_root}/disagg_config_ctxtp2_gentp2_gptoss_tllm.yaml",
"cancel_stress_test":
Expand Down Expand Up @@ -492,6 +495,11 @@ def run_client_tests(example_dir,
"The capital of Germany is Berlin",
"Using `asyncio` in Python"
]
elif "qwen3_32b_fp8" in test_desc:
expected_strings = [
"The capital of Germany is Berlin",
"Asyncio in Python is a library"
]
else:
expected_strings = [
"The capital of Germany is Berlin",
Expand Down Expand Up @@ -618,6 +626,13 @@ def setup_disagg_cluster(
with open(config_file, 'r') as f:
config = yaml.safe_load(f)

speculative_config = config.get("speculative_config")
if isinstance(speculative_config, dict):
speculative_model = speculative_config.get("speculative_model")
if speculative_model:
speculative_config["speculative_model"] = resolve_llm_model_path(
speculative_model)

disagg_cluster = get_default_disagg_cluster_config()
server_host = config.get("hostname", "localhost")
server_port = get_free_port()
Expand Down Expand Up @@ -648,6 +663,8 @@ def setup_disagg_cluster(

# Launch workers
model = model_name or config.get("model")
if model:
model = resolve_llm_model_path(model)
ctx_workers = []
gen_workers = []
disagg_server = None
Expand Down Expand Up @@ -2288,6 +2305,22 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root,
cwd=llm_venv.get_working_directory())


@skip_pre_hopper
@pytest.mark.skip_less_device(8)
@pytest.mark.parametrize("model_path", ['Qwen3/Qwen3-32B-FP8'])
def test_disaggregated_qwen3_32b_fp8(disaggregated_test_root,
                                     disaggregated_example_root, llm_venv,
                                     model_path):
    """Run the "qwen3_32b_fp8_stress" disaggregated test for Qwen3-32B-FP8.

    The parametrized ``model_path`` is resolved with
    ``resolve_llm_model_path``, a model symlink is set up through
    ``setup_model_symlink``, and ``run_disaggregated_test`` is then invoked
    with the virtual environment's env and working directory.

    ``disaggregated_test_root`` is requested as a fixture but is not
    referenced in the body.
    """
    resolved_model_dir = resolve_llm_model_path(model_path)
    setup_model_symlink(llm_venv, resolved_model_dir, model_path)

    # Gather the venv-derived arguments first, in the same order the call
    # consumes them.
    venv_env = llm_venv._new_env
    working_dir = llm_venv.get_working_directory()

    run_disaggregated_test(disaggregated_example_root,
                           "qwen3_32b_fp8_stress",
                           env=venv_env,
                           model_path=resolved_model_dir,
                           cwd=working_dir)


@pytest.mark.timeout(12600)
@pytest.mark.parametrize("test_config", [
pytest.param(TestConfig(model_path='DeepSeek-R1/DeepSeek-R1-0528-FP4-v2',
Expand Down Expand Up @@ -2349,6 +2382,11 @@ def test_disaggregated_gpt_oss_120b_harmony(disaggregated_test_root,
cancellation_rate=10,
cancellation_delay=0.5),
marks=(pytest.mark.skip_less_device(2), skip_no_hopper)),
pytest.param(TestConfig(model_path='Qwen3/Qwen3-32B-FP8',
test_desc='qwen3_32b_fp8_stress',
request_count=10000,
accuracy_threshold=0.42),
marks=(pytest.mark.skip_less_device(8), skip_pre_hopper)),
],
ids=lambda x: x.test_desc)
@pytest.mark.parametrize("concurrency", [512], ids=lambda x: f"conc{x}")
Expand All @@ -2363,7 +2401,7 @@ def test_disaggregated_stress_test(disaggregated_test_root,
# Unpack configuration from dataclass
model_path = test_config.model_path
test_desc = test_config.test_desc
model_dir = f"{llm_models_root()}/{model_path}"
model_dir = resolve_llm_model_path(model_path)
setup_model_symlink(llm_venv, model_dir, model_path)

config_file = get_test_config(test_desc, disaggregated_example_root,
Expand Down
1 change: 1 addition & 0 deletions tests/integration/test_lists/qa/llm_function_stress.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-outp
disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_eagle_trtllm_stress]
disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-gpt_oss_120b_triton_stress]
disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_5_4b_fp8_stress]
disaggregated/test_disaggregated.py::test_disaggregated_stress_test[input8k-output1k-conc512-qwen3_32b_fp8_stress]
stress_test/disagg_cancel/test_disagg_cancel_stress.py::test_disagg_cancellation_marathon[marathon_cpp_v1_deepseek.yaml] TIMEOUT (45)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_nvfp4_4gpus
Expand Down
1 change: 1 addition & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h200.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ l0_dgx_h200:
- disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
- disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ctxtp2ep2pp2_gentp4_one_mtp_block_reuse[DeepSeek-V3-Lite-fp8]
- disaggregated/test_disaggregated.py::test_disaggregated_qwen3_32b_fp8[Qwen3/Qwen3-32B-FP8]
- unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_gen_only_spec_dec
- condition:
Expand Down
1 change: 1 addition & 0 deletions tests/integration/test_lists/waives.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=False] SKIP (https://nvbugs/6260915)
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.5-fp8kv=True] SKIP (https://nvbugs/6248783)
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_4gpus[target_sparsity_0.9-fp8kv=False] SKIP (https://nvbugs/6260915)
accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16_mtp[mtp_on] SKIP (https://nvbugs/6206179)
accuracy/test_llm_api_pytorch.py::TestQwen3_5_397B_A17B::test_nvfp4[tep4_cutedsl] SKIP (https://nvbugs/6255417)
accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_bf16 SKIP (https://nvbugs/6283537)
accuracy/test_llm_api_pytorch.py::TestQwen3_5_9B::test_bf16[mtp_off] SKIP (https://nvbugs/6212250)
Expand Down
Loading