From 43aed4ffdf754590c2d0c9a47a722b396e0e80ff Mon Sep 17 00:00:00 2001
From: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
Date: Fri, 12 Jun 2026 15:46:11 -0700
Subject: [PATCH 1/5] [https://nvbugs/6301621][test] Re-enable AutoDeploy
 disagg tests

Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
---
 .../defs/disaggregated/test_ad_disagg.py      | 12 +++++++++-
 .../test_ad_disagg_trtllm_serve.py            | 12 +++++++++-
 tests/integration/test_lists/waives.txt       | 23 -------------------
 3 files changed, 22 insertions(+), 25 deletions(-)

diff --git a/tests/integration/defs/disaggregated/test_ad_disagg.py b/tests/integration/defs/disaggregated/test_ad_disagg.py
index c74a38316066..9308c5011532 100644
--- a/tests/integration/defs/disaggregated/test_ad_disagg.py
+++ b/tests/integration/defs/disaggregated/test_ad_disagg.py
@@ -25,7 +25,7 @@
 import cloudpickle
 import pytest
 import torch
-from defs.conftest import get_sm_version, skip_pre_hopper
+from defs.conftest import check_device_contain, get_sm_version, skip_pre_hopper
 from mpi4py import MPI
 from mpi4py.futures import MPIPoolExecutor
 
@@ -41,6 +41,16 @@
     pickle.HIGHEST_PROTOCOL,
 )
 
+
+@pytest.fixture(autouse=True)
+def skip_b300():
+    if check_device_contain(["B300"]):
+        pytest.skip(
+            "AutoDeploy disagg tests are disabled on B300/GB300 until capacity is available: "
+            "https://nvbugs/6301621"
+        )
+
+
 WORKER_READY = "ready"
 REQUEST_MODE_AGGREGATE = "aggregate"
 MPI_REQUEST = 9999
diff --git a/tests/integration/defs/disaggregated/test_ad_disagg_trtllm_serve.py b/tests/integration/defs/disaggregated/test_ad_disagg_trtllm_serve.py
index 7c3dbc35f099..9698e6799ef9 100644
--- a/tests/integration/defs/disaggregated/test_ad_disagg_trtllm_serve.py
+++ b/tests/integration/defs/disaggregated/test_ad_disagg_trtllm_serve.py
@@ -20,7 +20,7 @@
 import pytest
 import requests
 from defs.common import get_free_port_in_ci as get_free_port
-from defs.conftest import llm_models_root
+from defs.conftest import check_device_contain, llm_models_root
 from disagg_test_utils import (
     CHECK_STATUS_INTERVAL,
     HEARTBEAT_INTERVAL,
@@ -34,6 +34,16 @@
 
 pytest_plugins = ["disagg_test_utils"]
 
+
+@pytest.fixture(autouse=True)
+def skip_b300():
+    if check_device_contain(["B300"]):
+        pytest.skip(
+            "AutoDeploy disagg tests are disabled on B300/GB300 until capacity is available: "
+            "https://nvbugs/6301621"
+        )
+
+
 SERVER_START_TIMEOUT_S = 300
 SERVER_READY_REQUEST_TIMEOUT_S = 5
 OPENAI_REQUEST_TIMEOUT_S = 60
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index e06ce578ea92..c3492d0ed247 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -100,18 +100,6 @@ cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache
 cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-nixl_kvcache-90] SKIP (https://nvbugs/6093820)
 cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-ucx_kvcache-90] SKIP (https://nvbugs/6093820)
 cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199)
-disaggregated/test_ad_disagg.py::test_async_eagle3_full_model_handoff SKIP (https://nvbugs/6306936)
-disaggregated/test_ad_disagg.py::test_async_generation_matches_aggregate SKIP (https://nvbugs/6306936)
-disaggregated/test_ad_disagg.py::test_async_generation_no_overlap_matches_aggregate SKIP (https://nvbugs/6306936)
-disaggregated/test_ad_disagg.py::test_async_sharded_generation_handoff SKIP (https://nvbugs/6306936)
-disaggregated/test_ad_disagg.py::test_chunked_prefill_handoff[deepseek_v3_mla] SKIP (https://nvbugs/6306936)
-disaggregated/test_ad_disagg.py::test_chunked_prefill_handoff[tinyllama] SKIP (https://nvbugs/6306936)
-disaggregated/test_ad_disagg.py::test_disaggregated_logits[deepseek_v3_mla] SKIP (https://nvbugs/6306936)
-disaggregated/test_ad_disagg.py::test_disaggregated_logits[tinyllama] SKIP (https://nvbugs/6306936)
-disaggregated/test_ad_disagg.py::test_reduced_layer_handoff_matches_aggregate[deepseek_v3_mla] SKIP (https://nvbugs/6306936)
-disaggregated/test_ad_disagg.py::test_reduced_layer_handoff_matches_aggregate[tinyllama] SKIP (https://nvbugs/6306936)
-disaggregated/test_ad_disagg.py::test_tinyllama_batch_handoff_semantic_slots SKIP (https://nvbugs/6306936)
-disaggregated/test_ad_disagg_trtllm_serve.py::test_openai_completion SKIP (https://nvbugs/6306936)
 disaggregated/test_disaggregated.py::test_disaggregated_cancel_large_context_requests[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6105768)
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322)
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_conditional[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322)
@@ -465,17 +453,6 @@ unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_
 unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_qwen_next-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070)
 unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_topk_4-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070)
 unittest/_torch/visual_gen/test_flux_pipeline.py::TestFluxCombinedOptimizations::test_all_optimizations_combined SKIP (https://nvbugs/6199854)
-unittest/auto_deploy/singlegpu/smoke SKIP (https://nvbugs/6306936)
-unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_batch_smoke[deepseek-trtllm-simple] SKIP (https://nvbugs/6307525)
-unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_batch_smoke[llama-flashinfer-cudagraph] SKIP (https://nvbugs/6307525)
-unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_batch_smoke[llama-flashinfer-simple] SKIP (https://nvbugs/6307525)
-unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_batch_smoke[llama-trtllm-cudagraph] SKIP (https://nvbugs/6307525)
-unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_batch_smoke[llama-trtllm-simple] SKIP (https://nvbugs/6307525)
-unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_smoke[deepseek-trtllm-simple] SKIP (https://nvbugs/6307525)
-unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_smoke[llama-flashinfer-cudagraph] SKIP (https://nvbugs/6307525)
-unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_smoke[llama-flashinfer-simple] SKIP (https://nvbugs/6307525)
-unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_smoke[llama-trtllm-cudagraph] SKIP (https://nvbugs/6307525)
-unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_smoke[llama-trtllm-simple] SKIP (https://nvbugs/6307525)
 unittest/bindings/test_transfer_agent_bindings.py::TestNixlFunctionalTransfer::test_nixl_wait_in_progress_on_zero_timeout SKIP (https://nvbugs/6260897)
 unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_async SKIP (https://nvbugs/5741476)
 unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741)

From 5319704fa1af202ac99fa49fba180125fa2e237f Mon Sep 17 00:00:00 2001
From: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
Date: Wed, 17 Jun 2026 15:58:56 -0700
Subject: [PATCH 2/5] [None][test] Disable piecewise cudagraph for speculative
 AD tests

Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
---
 .../model_registry/configs/super_v3_mtp.yaml         |  3 +++
 .../defs/accuracy/test_llm_api_autodeploy.py         | 12 +++++++++---
 .../integration/defs/disaggregated/test_ad_disagg.py |  8 ++++++++
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/examples/auto_deploy/model_registry/configs/super_v3_mtp.yaml b/examples/auto_deploy/model_registry/configs/super_v3_mtp.yaml
index c3b5911ff956..d1331190335c 100644
--- a/examples/auto_deploy/model_registry/configs/super_v3_mtp.yaml
+++ b/examples/auto_deploy/model_registry/configs/super_v3_mtp.yaml
@@ -19,6 +19,9 @@ speculative_config:
   num_nextn_predict_layers: 6
   mtp_eagle_one_model: true
 transforms:
+  compile_model:
+    # MTP speculative decoding does not support piecewise CUDA graph capture yet.
+    piecewise_enabled: false
   detect_sharding:
     allreduce_strategy: NCCL
     # NOTE: add 'tp' to sharding dims only for high-throughput runs
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 902f7addf53a..06df87b6b763 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -168,6 +168,14 @@ def low_memory_overrides(config,
     return config
 
 
+def disable_piecewise_cuda_graph_for_speculation(config: dict) -> dict:
+    """Disable piecewise CUDA graph capture for speculative AutoDeploy tests."""
+    config.setdefault("transforms",
+                      {}).setdefault("compile_model",
+                                     {})["piecewise_enabled"] = False
+    return config
+
+
 def reduced_model_kwargs(num_hidden_layers: int,
                          model_path: str | None = None) -> dict:
     """Return model_kwargs to cap a model at ``num_hidden_layers`` layers.
@@ -370,9 +378,7 @@ def get_default_kwargs(self, attn_backend="flashinfer"):
                 "torch_dtype": "bfloat16"
             },
         }
-        kwargs.setdefault("transforms",
-                          {}).setdefault("compile_model",
-                                         {})["piecewise_enabled"] = False
+        disable_piecewise_cuda_graph_for_speculation(kwargs)
 
         return kwargs
 
diff --git a/tests/integration/defs/disaggregated/test_ad_disagg.py b/tests/integration/defs/disaggregated/test_ad_disagg.py
index 9308c5011532..cbd743f6a499 100644
--- a/tests/integration/defs/disaggregated/test_ad_disagg.py
+++ b/tests/integration/defs/disaggregated/test_ad_disagg.py
@@ -153,6 +153,12 @@ def seed_disagg():
         torch.cuda.manual_seed_all(AUTODEPLOY_DISAGG_SEED)
 
 
+def disable_piecewise_cuda_graph_for_speculation(config: dict) -> dict:
+    """Disable piecewise CUDA graph capture for speculative AutoDeploy tests."""
+    config.setdefault("transforms", {}).setdefault("compile_model", {})["piecewise_enabled"] = False
+    return config
+
+
 def base_config(extra_config=None):
     common_config = dict(
         runtime="trtllm",
@@ -167,6 +173,8 @@ def base_config(extra_config=None):
     )
     if extra_config:
         common_config.update(extra_config)
+    if common_config.get("speculative_config") is not None:
+        disable_piecewise_cuda_graph_for_speculation(common_config)
 
     return common_config
 

From 235217e144e964e62617979809c7e76426f35b0e Mon Sep 17 00:00:00 2001
From: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
Date: Wed, 17 Jun 2026 16:09:09 -0700
Subject: [PATCH 3/5] [None][test] Reject AD piecewise cudagraph with
 speculation

Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
---
 tensorrt_llm/_torch/auto_deploy/llm_args.py   | 14 +++++++++++++
 .../singlegpu/shim/test_llm_config.py         | 21 ++++++++++++++++---
 .../smoke/test_ad_speculative_decoding.py     | 10 +++++++++
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/tensorrt_llm/_torch/auto_deploy/llm_args.py b/tensorrt_llm/_torch/auto_deploy/llm_args.py
index 4b52e53aa568..afc18d3b3790 100644
--- a/tensorrt_llm/_torch/auto_deploy/llm_args.py
+++ b/tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -443,6 +443,20 @@ def reject_cudagraph_for_speculative_flashinfer(self):
             )
         return self
 
+    @model_validator(mode="after")
+    def reject_piecewise_cuda_graph_for_speculative_decoding(self):
+        compile_model = self.transforms.get("compile_model", {})
+        if (
+            self.speculative_config is not None
+            and self.is_cuda_graph_enabled()
+            and compile_model.get("piecewise_enabled", False)
+        ):
+            raise ValueError(
+                "Speculative decoding with AutoDeploy does not currently support piecewise CUDA "
+                "graph capture."
+            )
+        return self
+
     @model_validator(mode="after")
     def disable_piecewise_for_non_piecewise_backend(self):
         compile_model = self.transforms.get("compile_model")
diff --git a/tests/unittest/auto_deploy/singlegpu/shim/test_llm_config.py b/tests/unittest/auto_deploy/singlegpu/shim/test_llm_config.py
index 85553ebab42a..5639344dd18c 100644
--- a/tests/unittest/auto_deploy/singlegpu/shim/test_llm_config.py
+++ b/tests/unittest/auto_deploy/singlegpu/shim/test_llm_config.py
@@ -275,6 +275,10 @@ class TestSpeculativeConfigValidation:
     Verify that supported speculative modes are accepted and configured before executor setup.
     """
 
+    @staticmethod
+    def piecewise_disabled_transforms():
+        return {"compile_model": {"piecewise_enabled": False}}
+
     def test_accepts_eagle_one_model(self):
         from tensorrt_llm.llmapi import EagleDecodingConfig
 
@@ -284,7 +288,11 @@ def test_accepts_eagle_one_model(self):
             eagle3_one_model=True,
         )
         # Should not raise.
-        args = LlmArgs(model="test-model", speculative_config=spec_config)
+        args = LlmArgs(
+            model="test-model",
+            speculative_config=spec_config,
+            transforms=self.piecewise_disabled_transforms(),
+        )
         assert args.model_factory == "eagle_one_model"
 
     def test_accepts_mtp_eagle_one_model(self):
@@ -295,7 +303,11 @@ def test_accepts_mtp_eagle_one_model(self):
             mtp_eagle_one_model=True,
         )
         # Should not raise.
-        args = LlmArgs(model="test-model", speculative_config=spec_config)
+        args = LlmArgs(
+            model="test-model",
+            speculative_config=spec_config,
+            transforms=self.piecewise_disabled_transforms(),
+        )
         assert args.model_factory == "eagle_one_model"
 
     @pytest.mark.parametrize("compile_backend", ["torch-cudagraph", "torch-opt"])
@@ -356,7 +368,10 @@ def test_ssm_replay_with_spec_ok(self):
         args = LlmArgs(
             model="test-model",
             speculative_config=spec_config,
-            transforms={"insert_cached_ssm_attention": {"ssm_replay": True}},
+            transforms={
+                "compile_model": {"piecewise_enabled": False},
+                "insert_cached_ssm_attention": {"ssm_replay": True},
+            },
         )
         assert args.transforms["insert_cached_ssm_attention"]["ssm_replay"] is True
 
diff --git a/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_speculative_decoding.py b/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_speculative_decoding.py
index 90113f1ad6a0..245cc79ec1f2 100644
--- a/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_speculative_decoding.py
+++ b/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_speculative_decoding.py
@@ -45,6 +45,10 @@ def get_extra_seq_len_for_kv_cache(llm_args) -> int:
     return extra
 
 
+def piecewise_disabled_transforms():
+    return {"compile_model": {"piecewise_enabled": False}}
+
+
 def test_super_mtp_smoke():
     """Test one-model MTP/Eagle runtime with a tiny Nemotron SuperV3 target."""
     test_prompt = "What is the capital of France?"
@@ -190,6 +194,7 @@ def test_kv_cache_extra_seq_len_for_spec_dec():
         model="meta-llama/Meta-Llama-3.1-8B-Instruct",
         speculative_config=spec_config,
         disable_overlap_scheduler=True,
+        transforms=piecewise_disabled_transforms(),
     )
     extra = get_extra_seq_len_for_kv_cache(args_eagle)
     # Should include max_total_draft_tokens + get_num_extra_kv_tokens (max_draft_len - 1)
@@ -201,6 +206,7 @@ def test_kv_cache_extra_seq_len_for_spec_dec():
         model="meta-llama/Meta-Llama-3.1-8B-Instruct",
         speculative_config=spec_config,
         disable_overlap_scheduler=False,
+        transforms=piecewise_disabled_transforms(),
     )
     extra_overlap = get_extra_seq_len_for_kv_cache(args_eagle_overlap)
     # Should be more than without overlap
@@ -217,6 +223,7 @@ def test_mtp_autodeploy_uses_eagle_one_model_capture():
             num_nextn_predict_layers=3,
             mtp_eagle_one_model=True,
         ),
+        transforms=piecewise_disabled_transforms(),
     )
 
     assert isinstance(args.speculative_config, MTPDecodingConfig)
@@ -229,6 +236,9 @@ def test_detect_hidden_states_capture_last_layer_for_mtp_eagle_one_model():
     from tensorrt_llm._torch.auto_deploy.llm_args import LlmArgs
 
     config = get_small_model_config("meta-llama/Meta-Llama-3.1-8B-Instruct")
+    config["args"].setdefault("transforms", {}).setdefault("compile_model", {})[
+        "piecewise_enabled"
+    ] = False
 
     args = LlmArgs(
         **config["args"],

From 3e2cc4353949064f8343eb5a9eba7d0b72457900 Mon Sep 17 00:00:00 2001
From: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
Date: Wed, 17 Jun 2026 16:14:14 -0700
Subject: [PATCH 4/5] [None][test] Revert redundant AD accuracy helper

Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_autodeploy.py         | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 06df87b6b763..902f7addf53a 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -168,14 +168,6 @@ def low_memory_overrides(config,
     return config
 
 
-def disable_piecewise_cuda_graph_for_speculation(config: dict) -> dict:
-    """Disable piecewise CUDA graph capture for speculative AutoDeploy tests."""
-    config.setdefault("transforms",
-                      {}).setdefault("compile_model",
-                                     {})["piecewise_enabled"] = False
-    return config
-
-
 def reduced_model_kwargs(num_hidden_layers: int,
                          model_path: str | None = None) -> dict:
     """Return model_kwargs to cap a model at ``num_hidden_layers`` layers.
@@ -378,7 +370,9 @@ def get_default_kwargs(self, attn_backend="flashinfer"):
                 "torch_dtype": "bfloat16"
             },
         }
-        disable_piecewise_cuda_graph_for_speculation(kwargs)
+        kwargs.setdefault("transforms",
+                          {}).setdefault("compile_model",
+                                         {})["piecewise_enabled"] = False
 
         return kwargs
 

From aa3f6a24e1340e2bc9956a1e7eef6ff85544edea Mon Sep 17 00:00:00 2001
From: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:57:44 -0700
Subject: [PATCH 5/5] default max_seq_len for small model tests to prevent bad
 defaults from the factory. Maybe we want better defaults in the future, but
 production config files are agent-generated, so they don't have this issue
 when they set max_seq_len

Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
---
 tests/unittest/auto_deploy/_utils_test/_model_test_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/unittest/auto_deploy/_utils_test/_model_test_utils.py b/tests/unittest/auto_deploy/_utils_test/_model_test_utils.py
index 6069e8a0526c..e7b79ce4fd86 100644
--- a/tests/unittest/auto_deploy/_utils_test/_model_test_utils.py
+++ b/tests/unittest/auto_deploy/_utils_test/_model_test_utils.py
@@ -641,6 +641,7 @@ def get_small_model_config(model_hub_id: str, **llm_args_kwargs) -> Dict[str, An
         "free_gpu_memory_fraction": 0.0,  # No resizing of the cache to keep the mem footprint small
     }
     llm_args["max_batch_size"] = 2  # Minimum batching to speed up things
+    llm_args["max_seq_len"] = 256
     llm_args["cuda_graph_config"] = {"max_batch_size": 2}  # Match max_batch_size
     # update with custom llm_args kwargs
     llm_args.update(llm_args_kwargs)