NVIDIA · lfr-0531 · Jun 29, 2026 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026
@@ -34,9 +34,12 @@
 except RuntimeError:
     # Override TORCH_CUDA_ARCH_LIST for JIT compilation of flashinfer kernels
     # since the existed TORCH_CUDA_ARCH_LIST may be too general and flashinfer requires sm75+.
-    capability = torch.cuda.get_device_capability()
-    arch_list = f"{capability[0]}.{capability[1]}"
-    os.environ["TORCH_CUDA_ARCH_LIST"] = arch_list
+    # Guard on a visible GPU: with CUDA_VISIBLE_DEVICES="" (pure client) the
+    # capability query would force a CUDA context at import time.
+    if torch.cuda.is_available():
+        capability = torch.cuda.get_device_capability()
+        arch_list = f"{capability[0]}.{capability[1]}"
+        os.environ["TORCH_CUDA_ARCH_LIST"] = arch_list
 
 from tensorrt_llm._utils import prefer_pinned
 

@@ -472,7 +472,9 @@ def _add_layer(
         # the typical step. An all-generation typical_step over-provisions the
         # compressed-cache pool at the expense of the SWA pool, starving the
         # SWA pool and artificially capping the achievable batch size.
-        ctx_capacity = max_num_tokens if max_num_tokens is not None else max_seq_len
+        ctx_capacity = (
+            max_num_tokens if max_num_tokens is not None else max_seq_len
+        ) + self.num_extra_kv_tokens
         typical_step = BatchDesc(
             kv_caches=[
                 KVCacheDesc(capacity=ctx_capacity, history_length=0),
@@ -494,7 +496,16 @@ def _add_layer(
         # Constraint 2: general / chunked-prefill warmup — one fresh context request
         # at max_num_tokens (the per-iteration token budget).
         if max_num_tokens is not None:
-            constraints.append(BatchDesc([KVCacheDesc(capacity=max_num_tokens, history_length=0)]))
+            constraints.append(
+                BatchDesc(
+                    [
+                        KVCacheDesc(
+                            capacity=max_num_tokens + self.num_extra_kv_tokens,
+                            history_length=0,
+                        )
+                    ]
+                )
+            )
 
         return KVCacheManagerConfigPy(
             tokens_per_block=tokens_per_block,

@@ -33,7 +33,9 @@
 import triton
 import triton.language as tl
 
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+# Guard on a visible GPU so `import tensorrt_llm` stays GPU-free under
+# CUDA_VISIBLE_DEVICES="" (pure client); the (0, 0) sentinel sorts below any real capability.
+CUDA_CAPABILITY = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
 
 
 def _get_block_sizes(Lq: int, Lv: int):

@@ -50,7 +50,13 @@ def ceil_div(a, b):
     except ImportError:
         logger.warning("cuda-tile package not found, TileIR kernels will not be available")
     else:
-        if (cc := torch.cuda.get_device_properties()) and (cc.major, cc.minor) < (10, 0):
+        # Guard the device-properties probe: with no visible GPU (e.g.
+        # CUDA_VISIBLE_DEVICES="" on a pure client) it would force a CUDA
+        # context just from `import tensorrt_llm`. TileIR stays unavailable,
+        # which is correct for a GPU-less process.
+        if not torch.cuda.is_available():
+            logger.warning("No CUDA device visible, TileIR kernels will not be available")
+        elif (cc := torch.cuda.get_device_properties()) and (cc.major, cc.minor) < (10, 0):
             logger.warning(
                 f"TileIR requires compute capability 10.0 or higher, but the current device has "
                 f"{cc.major}.{cc.minor}. TileIR kernels will not be available"

@@ -261,6 +261,7 @@ def evaluate(self,
             f"Hypothesis testing report:\n{hypothesis_testing_params.report(score)}"
         )
         hypothesis_testing_params.assert_passing(score)
+        return score
 
 
 class VoxPopuli(AccuracyTask):

@@ -140,6 +140,19 @@ deepseek-ai/DeepSeek-V4-Flash:
   #   95.11 reference still holds for the hypothesis test.
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 95.11
+  - quant_algo: FP8_BLOCK_SCALES
+    spec_dec_algo: MTP
+    accuracy: 95.11
+deepseek-ai/DeepSeek-V4-Pro:
+  # Full GSM8K aggregate gate for the Pro deployment path: TP=8, EP=8,
+  # attention DP, TRTLLM MoE, FP8 KV cache, MTP max_draft_len=1, padded CUDA
+  # graphs, custom DeepSeek-V4 tokenizer, and the same system prompt used by
+  # the Pro GSM8K bench script. Measured 96.32 on the full Pro aggregate run
+  # (GSM8K, 1319 samples); floor set slightly below for run-to-run margin.
+  - quant_algo: FP8_BLOCK_SCALES
+    kv_cache_quant_algo: FP8
+    spec_dec_algo: MTP
+    accuracy: 96.0
 Qwen3/Qwen3-4B:
   - spec_dec_algo: Eagle
     accuracy: 85.823

@@ -54,6 +54,7 @@ def result(self):
 DEFAULT_SERVER_WAITING_TIMEOUT = 2100
 # Timeout for the accuracy evaluation
 DEFAULT_ACC_EVALUATION_TIMEOUT = 1500
+DEEPSEEKV4_TEST_MAX_BATCH_SIZE = 128
 
 
 @functools.lru_cache(maxsize=1)
@@ -2324,3 +2325,189 @@ def test_auto_dtype(self, use_py_transceiver, mocker):
         with launch_disaggregated_llm(disagg_cfg, ctx_cfg, gen_cfg,
                                       self.MODEL_PATH) as llm:
             run_accuracy_test(llm, self.MODEL_NAME, ["GSM8K"])
+
+
+@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT)
+@skip_pre_blackwell
+class TestDeepSeekV4Flash(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "deepseek-ai/DeepSeek-V4-Flash"
+    MODEL_PATH = f"{llm_models_root()}/DeepSeek-V4-Flash"
+
+    @pytest.mark.skip_less_device(4)
+    def test_auto_dtype(self):
+        # Disagg smoke test: CTX TP=2 + GEN TP=2 = 4 GPUs.
+        # NVFP4 weights ~71 GB/rank at TP=2, leaving ~107 GB for KV on B200.
+        # TRTLLM backend required (WIDEEP lacks MXFP4 support for V4-Flash).
+        # V4 uses pure-Python KVCacheManagerV2; needs Python transceiver.
+        # NIXL (not DEFAULT) skips the TRTLLM_USE_UCX_KVCACHE=1 fallback.
+        cache_transceiver_config = {
+            "backend": "NIXL",
+            "transceiver_runtime": "PYTHON",
+            "max_tokens_in_buffer": 4096,
+        }
+        ctx_server_config = {
+            "tensor_parallel_size": 2,
+            "moe_expert_parallel_size": 2,
+            "disable_overlap_scheduler": True,
+            "max_batch_size": DEEPSEEKV4_TEST_MAX_BATCH_SIZE,
+            "max_seq_len": 4096,
+            "kv_cache_config": {
+                "free_gpu_memory_fraction": 0.5,
+            },
+            "cache_transceiver_config": cache_transceiver_config,
+        }
+        gen_server_config = {
+            "tensor_parallel_size": 2,
+            "moe_expert_parallel_size": 2,
+            "enable_attention_dp": True,
+            "disable_overlap_scheduler": True,
+            "max_batch_size": DEEPSEEKV4_TEST_MAX_BATCH_SIZE,
+            "max_seq_len": 4096,
+            "moe_config": {
+                "backend": "TRTLLM",
+            },
+            "kv_cache_config": {
+                "free_gpu_memory_fraction": 0.5,
+            },
+            "cache_transceiver_config": cache_transceiver_config,
+        }
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1
+            },
+            "generation_servers": {
+                "num_instances": 1
+            },
+        }
+        # V4-Flash 148 GB weight prefetch + warmup takes >35 min, exceeding the default 2100 s server wait timeout.
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config,
+                                      gen_server_config,
+                                      self.MODEL_PATH,
+                                      server_waiting_timeout=3600) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, is_integration_test=True)
+
+    @pytest.mark.skip_less_device(4)
+    def test_gen_first(self):
+        """Gen-first quick validation for DSv4-Flash on KVCacheManagerV2 + NIXL python."""
+        cache_transceiver_config = {
+            "backend": "NIXL",
+            "transceiver_runtime": "PYTHON",
+            "max_tokens_in_buffer": 4096,
+        }
+        ctx_server_config = {
+            "tensor_parallel_size": 2,
+            "moe_expert_parallel_size": 2,
+            "disable_overlap_scheduler": True,
+            "max_batch_size": DEEPSEEKV4_TEST_MAX_BATCH_SIZE,
+            "max_seq_len": 4096,
+            "kv_cache_config": {
+                "free_gpu_memory_fraction": 0.5,
+            },
+            "cache_transceiver_config": cache_transceiver_config,
+        }
+        gen_server_config = {
+            "tensor_parallel_size": 2,
+            "moe_expert_parallel_size": 2,
+            "enable_attention_dp": True,
+            "disable_overlap_scheduler": True,
+            "max_batch_size": DEEPSEEKV4_TEST_MAX_BATCH_SIZE,
+            "max_seq_len": 4096,
+            "moe_config": {
+                "backend": "TRTLLM",
+            },
+            "kv_cache_config": {
+                "free_gpu_memory_fraction": 0.5,
+            },
+            "cache_transceiver_config": cache_transceiver_config,
+        }
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1
+            },
+            "generation_servers": {
+                "num_instances": 1
+            },
+            "schedule_style": "generation_first",
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config,
+                                      gen_server_config,
+                                      self.MODEL_PATH,
+                                      server_waiting_timeout=3600) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, is_integration_test=True)
+
+
+@pytest.mark.timeout(14400)
+@skip_pre_blackwell
+@pytest.mark.skip_less_device_memory(140000)
+class TestDeepSeekV4FlashBase(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "deepseek-ai/DeepSeek-V4-Flash-Base"
+    MODEL_PATH = f"{llm_models_root()}/DeepSeek-V4-Flash-Base"
+
+    @pytest.mark.skip_less_device(4)
+    def test_auto_dtype(self):
+        # Disagg smoke test: CTX TP=2 + GEN TP=2 = 4 GPUs.
+        # FP8 weights ~71 GB/rank at TP=4 → ~142 GB/rank at TP=2; requires
+        # ≥140 GB per GPU (fits on B300 288 GB, tight on B200 178 GB).
+        # TRTLLM backend: WIDEEP's FP8 block-scale path is Hopper-only.
+        # Compact batching keeps KV cache ~1 GB/rank (default ~100 GB requires fully-clean GPU memory).
+        # V4 uses pure-Python KVCacheManagerV2; needs Python transceiver.
+        # NIXL (not DEFAULT) skips the TRTLLM_USE_UCX_KVCACHE=1 fallback.
+        cache_transceiver_config = {
+            "backend": "NIXL",
+            "transceiver_runtime": "PYTHON",
+            "max_tokens_in_buffer": 4096,
+        }
+        ctx_server_config = {
+            "tensor_parallel_size": 2,
+            "moe_expert_parallel_size": 2,
+            "disable_overlap_scheduler": True,
+            "max_batch_size": 16,
+            "max_num_tokens": 4096,
+            "max_seq_len": 4096,
+            "kv_cache_config": {
+                "free_gpu_memory_fraction": 0.5,
+            },
+            "cache_transceiver_config": cache_transceiver_config,
+        }
+        gen_server_config = {
+            "tensor_parallel_size": 2,
+            "moe_expert_parallel_size": 2,
+            "enable_attention_dp": True,
+            "disable_overlap_scheduler": True,
+            "max_batch_size": DEEPSEEKV4_TEST_MAX_BATCH_SIZE,
+            "max_num_tokens": 4096,
+            "max_seq_len": 4096,
+            "moe_config": {
+                "backend": "TRTLLM",
+            },
+            "kv_cache_config": {
+                "free_gpu_memory_fraction": 0.5,
+            },
+            "cache_transceiver_config": cache_transceiver_config,
+        }
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1
+            },
+            "generation_servers": {
+                "num_instances": 1
+            },
+        }
+        # Same long-init reason as TestDeepSeekV4Flash above.
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config,
+                                      gen_server_config,
+                                      self.MODEL_PATH,
+                                      server_waiting_timeout=3600) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, is_integration_test=True)