Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions tensorrt_llm/_torch/attention_backend/flashinfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,12 @@
except RuntimeError:
# Override TORCH_CUDA_ARCH_LIST for JIT compilation of flashinfer kernels
# since the existing TORCH_CUDA_ARCH_LIST may be too general and flashinfer requires sm75+.
capability = torch.cuda.get_device_capability()
arch_list = f"{capability[0]}.{capability[1]}"
os.environ["TORCH_CUDA_ARCH_LIST"] = arch_list
# Guard on a visible GPU: with CUDA_VISIBLE_DEVICES="" (pure client) the
# capability query would force a CUDA context at import time.
if torch.cuda.is_available():
capability = torch.cuda.get_device_capability()
arch_list = f"{capability[0]}.{capability[1]}"
os.environ["TORCH_CUDA_ARCH_LIST"] = arch_list

from tensorrt_llm._utils import prefer_pinned

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,9 @@ def _add_layer(
# the typical step. An all-generation typical_step over-provisions the
# compressed-cache pool at the expense of the SWA pool, starving the
# SWA pool and artificially capping the achievable batch size.
ctx_capacity = max_num_tokens if max_num_tokens is not None else max_seq_len
ctx_capacity = (
max_num_tokens if max_num_tokens is not None else max_seq_len
) + self.num_extra_kv_tokens
typical_step = BatchDesc(
kv_caches=[
KVCacheDesc(capacity=ctx_capacity, history_length=0),
Expand All @@ -494,7 +496,16 @@ def _add_layer(
# Constraint 2: general / chunked-prefill warmup — one fresh context request
# at max_num_tokens (the per-iteration token budget).
if max_num_tokens is not None:
constraints.append(BatchDesc([KVCacheDesc(capacity=max_num_tokens, history_length=0)]))
constraints.append(
BatchDesc(
[
KVCacheDesc(
capacity=max_num_tokens + self.num_extra_kv_tokens,
history_length=0,
)
]
)
)

return KVCacheManagerConfigPy(
tokens_per_block=tokens_per_block,
Expand Down
4 changes: 3 additions & 1 deletion tensorrt_llm/_torch/attention_backend/triton_prefill.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@
import triton
import triton.language as tl

CUDA_CAPABILITY = torch.cuda.get_device_capability()
# Guard on a visible GPU so `import tensorrt_llm` stays GPU-free under
# CUDA_VISIBLE_DEVICES="" (pure client); (0, 0) is a safe sentinel there.
CUDA_CAPABILITY = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)


def _get_block_sizes(Lq: int, Lv: int):
Expand Down
8 changes: 7 additions & 1 deletion tensorrt_llm/_torch/cuda_tile_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,13 @@ def ceil_div(a, b):
except ImportError:
logger.warning("cuda-tile package not found, TileIR kernels will not be available")
else:
if (cc := torch.cuda.get_device_properties()) and (cc.major, cc.minor) < (10, 0):
# Guard the device-properties probe: with no visible GPU (e.g.
# CUDA_VISIBLE_DEVICES="" on a pure client) it would force a CUDA
# context just from `import tensorrt_llm`. TileIR stays unavailable,
# which is correct for a GPU-less process.
if not torch.cuda.is_available():
logger.warning("No CUDA device visible, TileIR kernels will not be available")
elif (cc := torch.cuda.get_device_properties()) and (cc.major, cc.minor) < (10, 0):
logger.warning(
f"TileIR requires compute capability 10.0 or higher, but the current device has "
f"{cc.major}.{cc.minor}. TileIR kernels will not be available"
Expand Down
1 change: 1 addition & 0 deletions tests/integration/defs/accuracy/accuracy_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ def evaluate(self,
f"Hypothesis testing report:\n{hypothesis_testing_params.report(score)}"
)
hypothesis_testing_params.assert_passing(score)
return score


class VoxPopuli(AccuracyTask):
Expand Down
13 changes: 13 additions & 0 deletions tests/integration/defs/accuracy/references/gsm8k.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,19 @@ deepseek-ai/DeepSeek-V4-Flash:
# 95.11 reference still holds for the hypothesis test.
- quant_algo: FP8_BLOCK_SCALES
accuracy: 95.11
- quant_algo: FP8_BLOCK_SCALES
spec_dec_algo: MTP
accuracy: 95.11
deepseek-ai/DeepSeek-V4-Pro:
# Full GSM8K aggregate gate for the Pro deployment path: TP=8, EP=8,
# attention DP, TRTLLM MoE, FP8 KV cache, MTP max_draft_len=1, padded CUDA
# graphs, custom DeepSeek-V4 tokenizer, and the same system prompt used by
# the Pro GSM8K bench script. Measured 96.32 on the full Pro aggregate run
# (GSM8K, 1319 samples); floor set slightly below for run-to-run margin.
- quant_algo: FP8_BLOCK_SCALES
kv_cache_quant_algo: FP8
spec_dec_algo: MTP
accuracy: 96.0
Qwen3/Qwen3-4B:
- spec_dec_algo: Eagle
accuracy: 85.823
Expand Down
187 changes: 187 additions & 0 deletions tests/integration/defs/accuracy/test_disaggregated_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def result(self):
DEFAULT_SERVER_WAITING_TIMEOUT = 2100
# Timeout for the accuracy evaluation
DEFAULT_ACC_EVALUATION_TIMEOUT = 1500
DEEPSEEKV4_TEST_MAX_BATCH_SIZE = 128


@functools.lru_cache(maxsize=1)
Expand Down Expand Up @@ -2324,3 +2325,189 @@ def test_auto_dtype(self, use_py_transceiver, mocker):
with launch_disaggregated_llm(disagg_cfg, ctx_cfg, gen_cfg,
self.MODEL_PATH) as llm:
run_accuracy_test(llm, self.MODEL_NAME, ["GSM8K"])


@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT)
@skip_pre_blackwell
class TestDeepSeekV4Flash(LlmapiAccuracyTestHarness):
    """MMLU accuracy checks for DeepSeek-V4-Flash under disaggregated serving.

    Both tests launch one context server and one generation server
    (TP=2 each, 4 GPUs total) with identical per-server settings; they differ
    only in the ``schedule_style`` of the disaggregated server config.
    """

    MODEL_NAME = "deepseek-ai/DeepSeek-V4-Flash"
    MODEL_PATH = f"{llm_models_root()}/DeepSeek-V4-Flash"

    def _run_disagg_mmlu(self, schedule_style=None):
        """Launch a 1 CTX + 1 GEN disaggregated deployment and evaluate MMLU.

        Not collected by pytest (no ``test_`` prefix); shared by the tests
        below so the server configuration is defined in exactly one place.

        Args:
            schedule_style: Optional value for the ``schedule_style`` key of
                the disaggregated server config. When ``None`` the key is
                omitted entirely.
        """
        # NVFP4 weights ~71 GB/rank at TP=2, leaving ~107 GB for KV on B200.
        # V4 uses pure-Python KVCacheManagerV2; needs Python transceiver.
        # NIXL (not DEFAULT) skips the TRTLLM_USE_UCX_KVCACHE=1 fallback.
        # The same dict object is intentionally shared by both server configs.
        cache_transceiver_config = {
            "backend": "NIXL",
            "transceiver_runtime": "PYTHON",
            "max_tokens_in_buffer": 4096,
        }
        ctx_server_config = {
            "tensor_parallel_size": 2,
            "moe_expert_parallel_size": 2,
            "disable_overlap_scheduler": True,
            "max_batch_size": DEEPSEEKV4_TEST_MAX_BATCH_SIZE,
            "max_seq_len": 4096,
            "kv_cache_config": {
                "free_gpu_memory_fraction": 0.5,
            },
            "cache_transceiver_config": cache_transceiver_config,
        }
        # TRTLLM backend required (WIDEEP lacks MXFP4 support for V4-Flash).
        gen_server_config = {
            "tensor_parallel_size": 2,
            "moe_expert_parallel_size": 2,
            "enable_attention_dp": True,
            "disable_overlap_scheduler": True,
            "max_batch_size": DEEPSEEKV4_TEST_MAX_BATCH_SIZE,
            "max_seq_len": 4096,
            "moe_config": {
                "backend": "TRTLLM",
            },
            "kv_cache_config": {
                "free_gpu_memory_fraction": 0.5,
            },
            "cache_transceiver_config": cache_transceiver_config,
        }
        disaggregated_server_config = {
            "hostname": "localhost",
            "backend": "pytorch",
            "context_servers": {
                "num_instances": 1
            },
            "generation_servers": {
                "num_instances": 1
            },
        }
        if schedule_style is not None:
            disaggregated_server_config["schedule_style"] = schedule_style

        # V4-Flash 148GB weight prefetch + warmup needs >35 min, which exceeds
        # the default server wait timeout, so wait up to 3600 s instead.
        with launch_disaggregated_llm(disaggregated_server_config,
                                      ctx_server_config,
                                      gen_server_config,
                                      self.MODEL_PATH,
                                      server_waiting_timeout=3600) as llm:
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm, is_integration_test=True)

    @pytest.mark.skip_less_device(4)
    def test_auto_dtype(self):
        """Disagg smoke test: CTX TP=2 + GEN TP=2 = 4 GPUs."""
        self._run_disagg_mmlu()

    @pytest.mark.skip_less_device(4)
    def test_gen_first(self):
        """Gen-first quick validation for DSv4-Flash on KVCacheManagerV2 + NIXL python."""
        self._run_disagg_mmlu(schedule_style="generation_first")


@pytest.mark.timeout(14400)
@skip_pre_blackwell
@pytest.mark.skip_less_device_memory(140000)
class TestDeepSeekV4FlashBase(LlmapiAccuracyTestHarness):
    """MMLU accuracy check for DeepSeek-V4-Flash-Base under disaggregated serving."""

    MODEL_NAME = "deepseek-ai/DeepSeek-V4-Flash-Base"
    MODEL_PATH = f"{llm_models_root()}/DeepSeek-V4-Flash-Base"

    @pytest.mark.skip_less_device(4)
    def test_auto_dtype(self):
        """Disagg smoke test: one CTX server (TP=2) + one GEN server (TP=2) = 4 GPUs."""
        # FP8 weights ~71 GB/rank at TP=4 → ~142 GB/rank at TP=2; requires
        # ≥140 GB per GPU (fits on B300 288 GB, tight on B200 178 GB).
        # Compact batching keeps KV cache ~1 GB/rank (default ~100 GB requires
        # fully-clean GPU memory).

        # V4 uses pure-Python KVCacheManagerV2, so the Python transceiver is
        # needed; NIXL (not DEFAULT) skips the TRTLLM_USE_UCX_KVCACHE=1
        # fallback. This one dict object is shared by both server configs.
        transceiver_cfg = dict(
            backend="NIXL",
            transceiver_runtime="PYTHON",
            max_tokens_in_buffer=4096,
        )

        ctx_cfg = dict(
            tensor_parallel_size=2,
            moe_expert_parallel_size=2,
            disable_overlap_scheduler=True,
            max_batch_size=16,
            max_num_tokens=4096,
            max_seq_len=4096,
            kv_cache_config=dict(free_gpu_memory_fraction=0.5),
            cache_transceiver_config=transceiver_cfg,
        )

        # TRTLLM backend: WIDEEP's FP8 block-scale path is Hopper-only.
        gen_cfg = dict(
            tensor_parallel_size=2,
            moe_expert_parallel_size=2,
            enable_attention_dp=True,
            disable_overlap_scheduler=True,
            max_batch_size=DEEPSEEKV4_TEST_MAX_BATCH_SIZE,
            max_num_tokens=4096,
            max_seq_len=4096,
            moe_config=dict(backend="TRTLLM"),
            kv_cache_config=dict(free_gpu_memory_fraction=0.5),
            cache_transceiver_config=transceiver_cfg,
        )

        disagg_cfg = dict(
            hostname="localhost",
            backend="pytorch",
            context_servers=dict(num_instances=1),
            generation_servers=dict(num_instances=1),
        )

        # Long model init (same reason as TestDeepSeekV4Flash): allow 3600 s
        # for the servers to come up.
        with launch_disaggregated_llm(disagg_cfg,
                                      ctx_cfg,
                                      gen_cfg,
                                      self.MODEL_PATH,
                                      server_waiting_timeout=3600) as llm:
            MMLU(self.MODEL_NAME).evaluate(llm, is_integration_test=True)
Loading
Loading