[None][test] Reject AD piecewise cudagraph with speculation

govind-ramnarayan · govind-ramnarayan · commit e419a17bf191 · 2026-06-17T16:09:09.000-07:00
Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/auto_deploy/llm_args.py b/tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -443,6 +443,20 @@ def reject_cudagraph_for_speculative_flashinfer(self):
             )
         return self
 
+    @model_validator(mode="after")  # rejects spec decoding + piecewise CUDA graph capture
+    def reject_piecewise_cuda_graph_for_speculative_decoding(self):
+        compile_model = self.transforms.get("compile_model", {})  # {} when no entry is set
+        if (
+            self.speculative_config is not None
+            and self.is_cuda_graph_enabled()
+            and compile_model.get("piecewise_enabled", False)  # absent flag counts as off
+        ):
+            raise ValueError(
+                "Speculative decoding with AutoDeploy does not currently support piecewise CUDA "
+                "graph capture."
+            )
+        return self  # no error raised: the instance is handed back as-is
+
     @model_validator(mode="after")
     def disable_piecewise_for_non_piecewise_backend(self):
         compile_model = self.transforms.get("compile_model")
diff --git a/tests/unittest/auto_deploy/singlegpu/shim/test_llm_config.py b/tests/unittest/auto_deploy/singlegpu/shim/test_llm_config.py
@@ -275,6 +275,10 @@ class TestSpeculativeConfigValidation:
     Verify that supported speculative modes are accepted and configured before executor setup.
     """
 
+    @staticmethod
+    def piecewise_disabled_transforms():  # `transforms` kwarg used by the tests below
+        return {"compile_model": {"piecewise_enabled": False}}  # new dict on every call
+
     def test_accepts_eagle_one_model(self):
         from tensorrt_llm.llmapi import EagleDecodingConfig
 
@@ -284,7 +288,11 @@ def test_accepts_eagle_one_model(self):
             eagle3_one_model=True,
         )
         # Should not raise.
-        args = LlmArgs(model="test-model", speculative_config=spec_config)
+        args = LlmArgs(
+            model="test-model",
+            speculative_config=spec_config,
+            transforms=self.piecewise_disabled_transforms(),
+        )
         assert args.model_factory == "eagle_one_model"
 
     def test_accepts_mtp_eagle_one_model(self):
@@ -295,7 +303,11 @@ def test_accepts_mtp_eagle_one_model(self):
             mtp_eagle_one_model=True,
         )
         # Should not raise.
-        args = LlmArgs(model="test-model", speculative_config=spec_config)
+        args = LlmArgs(
+            model="test-model",
+            speculative_config=spec_config,
+            transforms=self.piecewise_disabled_transforms(),
+        )
         assert args.model_factory == "eagle_one_model"
 
     @pytest.mark.parametrize("compile_backend", ["torch-cudagraph", "torch-opt"])
@@ -356,7 +368,10 @@ def test_ssm_replay_with_spec_ok(self):
         args = LlmArgs(
             model="test-model",
             speculative_config=spec_config,
-            transforms={"insert_cached_ssm_attention": {"ssm_replay": True}},
+            transforms={
+                "compile_model": {"piecewise_enabled": False},
+                "insert_cached_ssm_attention": {"ssm_replay": True},
+            },
         )
         assert args.transforms["insert_cached_ssm_attention"]["ssm_replay"] is True
 
diff --git a/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_speculative_decoding.py b/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_speculative_decoding.py
@@ -45,6 +45,10 @@ def get_extra_seq_len_for_kv_cache(llm_args) -> int:
     return extra
 
 
+def piecewise_disabled_transforms():  # `transforms` kwarg that turns piecewise off
+    return {"compile_model": {"piecewise_enabled": False}}  # new dict on every call
+
+
 def test_super_mtp_smoke():
     """Test one-model MTP/Eagle runtime with a tiny Nemotron SuperV3 target."""
     test_prompt = "What is the capital of France?"
@@ -190,6 +194,7 @@ def test_kv_cache_extra_seq_len_for_spec_dec():
         model="meta-llama/Meta-Llama-3.1-8B-Instruct",
         speculative_config=spec_config,
         disable_overlap_scheduler=True,
+        transforms=piecewise_disabled_transforms(),
     )
     extra = get_extra_seq_len_for_kv_cache(args_eagle)
     # Should include max_total_draft_tokens + get_num_extra_kv_tokens (max_draft_len - 1)
@@ -201,6 +206,7 @@ def test_kv_cache_extra_seq_len_for_spec_dec():
         model="meta-llama/Meta-Llama-3.1-8B-Instruct",
         speculative_config=spec_config,
         disable_overlap_scheduler=False,
+        transforms=piecewise_disabled_transforms(),
     )
     extra_overlap = get_extra_seq_len_for_kv_cache(args_eagle_overlap)
     # Should be more than without overlap
@@ -217,6 +223,7 @@ def test_mtp_autodeploy_uses_eagle_one_model_capture():
             num_nextn_predict_layers=3,
             mtp_eagle_one_model=True,
         ),
+        transforms=piecewise_disabled_transforms(),
     )
 
     assert isinstance(args.speculative_config, MTPDecodingConfig)
@@ -229,6 +236,9 @@ def test_detect_hidden_states_capture_last_layer_for_mtp_eagle_one_model():
     from tensorrt_llm._torch.auto_deploy.llm_args import LlmArgs
 
     config = get_small_model_config("meta-llama/Meta-Llama-3.1-8B-Instruct")
+    config["args"].setdefault("transforms", {}).setdefault("compile_model", {})[
+        "piecewise_enabled"
+    ] = False
 
     args = LlmArgs(
         **config["args"],