From 43aed4ffdf754590c2d0c9a47a722b396e0e80ff Mon Sep 17 00:00:00 2001 From: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com> Date: Fri, 12 Jun 2026 15:46:11 -0700 Subject: [PATCH 1/5] [https://nvbugs/6301621][test] Re-enable AutoDeploy disagg tests Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com> --- .../defs/disaggregated/test_ad_disagg.py | 12 +++++++++- .../test_ad_disagg_trtllm_serve.py | 12 +++++++++- tests/integration/test_lists/waives.txt | 23 ------------------- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/tests/integration/defs/disaggregated/test_ad_disagg.py b/tests/integration/defs/disaggregated/test_ad_disagg.py index c74a38316066..9308c5011532 100644 --- a/tests/integration/defs/disaggregated/test_ad_disagg.py +++ b/tests/integration/defs/disaggregated/test_ad_disagg.py @@ -25,7 +25,7 @@ import cloudpickle import pytest import torch -from defs.conftest import get_sm_version, skip_pre_hopper +from defs.conftest import check_device_contain, get_sm_version, skip_pre_hopper from mpi4py import MPI from mpi4py.futures import MPIPoolExecutor @@ -41,6 +41,16 @@ pickle.HIGHEST_PROTOCOL, ) + +@pytest.fixture(autouse=True) +def skip_b300(): + if check_device_contain(["B300"]): + pytest.skip( + "AutoDeploy disagg tests are disabled on B300/GB300 until capacity is available: " + "https://nvbugs/6301621" + ) + + WORKER_READY = "ready" REQUEST_MODE_AGGREGATE = "aggregate" MPI_REQUEST = 9999 diff --git a/tests/integration/defs/disaggregated/test_ad_disagg_trtllm_serve.py b/tests/integration/defs/disaggregated/test_ad_disagg_trtllm_serve.py index 7c3dbc35f099..9698e6799ef9 100644 --- a/tests/integration/defs/disaggregated/test_ad_disagg_trtllm_serve.py +++ b/tests/integration/defs/disaggregated/test_ad_disagg_trtllm_serve.py @@ -20,7 +20,7 @@ import pytest import requests from defs.common import get_free_port_in_ci as get_free_port -from defs.conftest import llm_models_root +from 
defs.conftest import check_device_contain, llm_models_root from disagg_test_utils import ( CHECK_STATUS_INTERVAL, HEARTBEAT_INTERVAL, @@ -34,6 +34,16 @@ pytest_plugins = ["disagg_test_utils"] + +@pytest.fixture(autouse=True) +def skip_b300(): + if check_device_contain(["B300"]): + pytest.skip( + "AutoDeploy disagg tests are disabled on B300/GB300 until capacity is available: " + "https://nvbugs/6301621" + ) + + SERVER_START_TIMEOUT_S = 300 SERVER_READY_REQUEST_TIMEOUT_S = 5 OPENAI_REQUEST_TIMEOUT_S = 60 diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index e06ce578ea92..c3492d0ed247 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -100,18 +100,6 @@ cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-nixl_kvcache-90] SKIP (https://nvbugs/6093820) cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-ucx_kvcache-90] SKIP (https://nvbugs/6093820) cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199) -disaggregated/test_ad_disagg.py::test_async_eagle3_full_model_handoff SKIP (https://nvbugs/6306936) -disaggregated/test_ad_disagg.py::test_async_generation_matches_aggregate SKIP (https://nvbugs/6306936) -disaggregated/test_ad_disagg.py::test_async_generation_no_overlap_matches_aggregate SKIP (https://nvbugs/6306936) -disaggregated/test_ad_disagg.py::test_async_sharded_generation_handoff SKIP (https://nvbugs/6306936) -disaggregated/test_ad_disagg.py::test_chunked_prefill_handoff[deepseek_v3_mla] SKIP (https://nvbugs/6306936) -disaggregated/test_ad_disagg.py::test_chunked_prefill_handoff[tinyllama] SKIP (https://nvbugs/6306936) -disaggregated/test_ad_disagg.py::test_disaggregated_logits[deepseek_v3_mla] SKIP (https://nvbugs/6306936) -disaggregated/test_ad_disagg.py::test_disaggregated_logits[tinyllama] SKIP 
(https://nvbugs/6306936) -disaggregated/test_ad_disagg.py::test_reduced_layer_handoff_matches_aggregate[deepseek_v3_mla] SKIP (https://nvbugs/6306936) -disaggregated/test_ad_disagg.py::test_reduced_layer_handoff_matches_aggregate[tinyllama] SKIP (https://nvbugs/6306936) -disaggregated/test_ad_disagg.py::test_tinyllama_batch_handoff_semantic_slots SKIP (https://nvbugs/6306936) -disaggregated/test_ad_disagg_trtllm_serve.py::test_openai_completion SKIP (https://nvbugs/6306936) disaggregated/test_disaggregated.py::test_disaggregated_cancel_large_context_requests[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6105768) disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322) disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_conditional[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/6162322) @@ -465,17 +453,6 @@ unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_ unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_qwen_next-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070) unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_topk_4-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070) unittest/_torch/visual_gen/test_flux_pipeline.py::TestFluxCombinedOptimizations::test_all_optimizations_combined SKIP (https://nvbugs/6199854) -unittest/auto_deploy/singlegpu/smoke SKIP (https://nvbugs/6306936) -unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_batch_smoke[deepseek-trtllm-simple] SKIP (https://nvbugs/6307525) -unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_batch_smoke[llama-flashinfer-cudagraph] SKIP (https://nvbugs/6307525) -unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_batch_smoke[llama-flashinfer-simple] SKIP 
(https://nvbugs/6307525) -unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_batch_smoke[llama-trtllm-cudagraph] SKIP (https://nvbugs/6307525) -unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_batch_smoke[llama-trtllm-simple] SKIP (https://nvbugs/6307525) -unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_smoke[deepseek-trtllm-simple] SKIP (https://nvbugs/6307525) -unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_smoke[llama-flashinfer-cudagraph] SKIP (https://nvbugs/6307525) -unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_smoke[llama-flashinfer-simple] SKIP (https://nvbugs/6307525) -unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_smoke[llama-trtllm-cudagraph] SKIP (https://nvbugs/6307525) -unittest/auto_deploy/singlegpu/smoke/test_disagg.py::test_autodeploy_disaggregated_smoke[llama-trtllm-simple] SKIP (https://nvbugs/6307525) unittest/bindings/test_transfer_agent_bindings.py::TestNixlFunctionalTransfer::test_nixl_wait_in_progress_on_zero_timeout SKIP (https://nvbugs/6260897) unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_async SKIP (https://nvbugs/5741476) unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741) From 5319704fa1af202ac99fa49fba180125fa2e237f Mon Sep 17 00:00:00 2001 From: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com> Date: Wed, 17 Jun 2026 15:58:56 -0700 Subject: [PATCH 2/5] [None][test] Disable piecewise cudagraph for speculative AD tests Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com> --- .../model_registry/configs/super_v3_mtp.yaml | 3 +++ .../defs/accuracy/test_llm_api_autodeploy.py | 12 +++++++++--- .../integration/defs/disaggregated/test_ad_disagg.py | 8 ++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git 
a/examples/auto_deploy/model_registry/configs/super_v3_mtp.yaml b/examples/auto_deploy/model_registry/configs/super_v3_mtp.yaml index c3b5911ff956..d1331190335c 100644 --- a/examples/auto_deploy/model_registry/configs/super_v3_mtp.yaml +++ b/examples/auto_deploy/model_registry/configs/super_v3_mtp.yaml @@ -19,6 +19,9 @@ speculative_config: num_nextn_predict_layers: 6 mtp_eagle_one_model: true transforms: + compile_model: + # MTP speculative decoding does not support piecewise CUDA graph capture yet. + piecewise_enabled: false detect_sharding: allreduce_strategy: NCCL # NOTE: add 'tp' to sharding dims only for high-throughput runs diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py index 902f7addf53a..06df87b6b763 100644 --- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py +++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py @@ -168,6 +168,14 @@ def low_memory_overrides(config, return config +def disable_piecewise_cuda_graph_for_speculation(config: dict) -> dict: + """Disable piecewise CUDA graph capture for speculative AutoDeploy tests.""" + config.setdefault("transforms", + {}).setdefault("compile_model", + {})["piecewise_enabled"] = False + return config + + def reduced_model_kwargs(num_hidden_layers: int, model_path: str | None = None) -> dict: """Return model_kwargs to cap a model at ``num_hidden_layers`` layers. 
@@ -370,9 +378,7 @@ def get_default_kwargs(self, attn_backend="flashinfer"): "torch_dtype": "bfloat16" }, } - kwargs.setdefault("transforms", - {}).setdefault("compile_model", - {})["piecewise_enabled"] = False + disable_piecewise_cuda_graph_for_speculation(kwargs) return kwargs diff --git a/tests/integration/defs/disaggregated/test_ad_disagg.py b/tests/integration/defs/disaggregated/test_ad_disagg.py index 9308c5011532..cbd743f6a499 100644 --- a/tests/integration/defs/disaggregated/test_ad_disagg.py +++ b/tests/integration/defs/disaggregated/test_ad_disagg.py @@ -153,6 +153,12 @@ def seed_disagg(): torch.cuda.manual_seed_all(AUTODEPLOY_DISAGG_SEED) +def disable_piecewise_cuda_graph_for_speculation(config: dict) -> dict: + """Disable piecewise CUDA graph capture for speculative AutoDeploy tests.""" + config.setdefault("transforms", {}).setdefault("compile_model", {})["piecewise_enabled"] = False + return config + + def base_config(extra_config=None): common_config = dict( runtime="trtllm", @@ -167,6 +173,8 @@ def base_config(extra_config=None): ) if extra_config: common_config.update(extra_config) + if common_config.get("speculative_config") is not None: + disable_piecewise_cuda_graph_for_speculation(common_config) return common_config From 235217e144e964e62617979809c7e76426f35b0e Mon Sep 17 00:00:00 2001 From: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com> Date: Wed, 17 Jun 2026 16:09:09 -0700 Subject: [PATCH 3/5] [None][test] Reject AD piecewise cudagraph with speculation Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com> --- tensorrt_llm/_torch/auto_deploy/llm_args.py | 14 +++++++++++++ .../singlegpu/shim/test_llm_config.py | 21 ++++++++++++++++--- .../smoke/test_ad_speculative_decoding.py | 10 +++++++++ 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/llm_args.py b/tensorrt_llm/_torch/auto_deploy/llm_args.py index 4b52e53aa568..afc18d3b3790 100644 --- 
a/tensorrt_llm/_torch/auto_deploy/llm_args.py +++ b/tensorrt_llm/_torch/auto_deploy/llm_args.py @@ -443,6 +443,20 @@ def reject_cudagraph_for_speculative_flashinfer(self): ) return self + @model_validator(mode="after") + def reject_piecewise_cuda_graph_for_speculative_decoding(self): + compile_model = self.transforms.get("compile_model", {}) + if ( + self.speculative_config is not None + and self.is_cuda_graph_enabled() + and compile_model.get("piecewise_enabled", False) + ): + raise ValueError( + "Speculative decoding with AutoDeploy does not currently support piecewise CUDA " + "graph capture." + ) + return self + @model_validator(mode="after") def disable_piecewise_for_non_piecewise_backend(self): compile_model = self.transforms.get("compile_model") diff --git a/tests/unittest/auto_deploy/singlegpu/shim/test_llm_config.py b/tests/unittest/auto_deploy/singlegpu/shim/test_llm_config.py index 85553ebab42a..5639344dd18c 100644 --- a/tests/unittest/auto_deploy/singlegpu/shim/test_llm_config.py +++ b/tests/unittest/auto_deploy/singlegpu/shim/test_llm_config.py @@ -275,6 +275,10 @@ class TestSpeculativeConfigValidation: Verify that supported speculative modes are accepted and configured before executor setup. """ + @staticmethod + def piecewise_disabled_transforms(): + return {"compile_model": {"piecewise_enabled": False}} + def test_accepts_eagle_one_model(self): from tensorrt_llm.llmapi import EagleDecodingConfig @@ -284,7 +288,11 @@ def test_accepts_eagle_one_model(self): eagle3_one_model=True, ) # Should not raise. - args = LlmArgs(model="test-model", speculative_config=spec_config) + args = LlmArgs( + model="test-model", + speculative_config=spec_config, + transforms=self.piecewise_disabled_transforms(), + ) assert args.model_factory == "eagle_one_model" def test_accepts_mtp_eagle_one_model(self): @@ -295,7 +303,11 @@ def test_accepts_mtp_eagle_one_model(self): mtp_eagle_one_model=True, ) # Should not raise. 
- args = LlmArgs(model="test-model", speculative_config=spec_config) + args = LlmArgs( + model="test-model", + speculative_config=spec_config, + transforms=self.piecewise_disabled_transforms(), + ) assert args.model_factory == "eagle_one_model" @pytest.mark.parametrize("compile_backend", ["torch-cudagraph", "torch-opt"]) @@ -356,7 +368,10 @@ def test_ssm_replay_with_spec_ok(self): args = LlmArgs( model="test-model", speculative_config=spec_config, - transforms={"insert_cached_ssm_attention": {"ssm_replay": True}}, + transforms={ + "compile_model": {"piecewise_enabled": False}, + "insert_cached_ssm_attention": {"ssm_replay": True}, + }, ) assert args.transforms["insert_cached_ssm_attention"]["ssm_replay"] is True diff --git a/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_speculative_decoding.py b/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_speculative_decoding.py index 90113f1ad6a0..245cc79ec1f2 100644 --- a/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_speculative_decoding.py +++ b/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_speculative_decoding.py @@ -45,6 +45,10 @@ def get_extra_seq_len_for_kv_cache(llm_args) -> int: return extra +def piecewise_disabled_transforms(): + return {"compile_model": {"piecewise_enabled": False}} + + def test_super_mtp_smoke(): """Test one-model MTP/Eagle runtime with a tiny Nemotron SuperV3 target.""" test_prompt = "What is the capital of France?" 
@@ -190,6 +194,7 @@ def test_kv_cache_extra_seq_len_for_spec_dec(): model="meta-llama/Meta-Llama-3.1-8B-Instruct", speculative_config=spec_config, disable_overlap_scheduler=True, + transforms=piecewise_disabled_transforms(), ) extra = get_extra_seq_len_for_kv_cache(args_eagle) # Should include max_total_draft_tokens + get_num_extra_kv_tokens (max_draft_len - 1) @@ -201,6 +206,7 @@ def test_kv_cache_extra_seq_len_for_spec_dec(): model="meta-llama/Meta-Llama-3.1-8B-Instruct", speculative_config=spec_config, disable_overlap_scheduler=False, + transforms=piecewise_disabled_transforms(), ) extra_overlap = get_extra_seq_len_for_kv_cache(args_eagle_overlap) # Should be more than without overlap @@ -217,6 +223,7 @@ def test_mtp_autodeploy_uses_eagle_one_model_capture(): num_nextn_predict_layers=3, mtp_eagle_one_model=True, ), + transforms=piecewise_disabled_transforms(), ) assert isinstance(args.speculative_config, MTPDecodingConfig) @@ -229,6 +236,9 @@ def test_detect_hidden_states_capture_last_layer_for_mtp_eagle_one_model(): from tensorrt_llm._torch.auto_deploy.llm_args import LlmArgs config = get_small_model_config("meta-llama/Meta-Llama-3.1-8B-Instruct") + config["args"].setdefault("transforms", {}).setdefault("compile_model", {})[ + "piecewise_enabled" + ] = False args = LlmArgs( **config["args"], From 3e2cc4353949064f8343eb5a9eba7d0b72457900 Mon Sep 17 00:00:00 2001 From: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com> Date: Wed, 17 Jun 2026 16:14:14 -0700 Subject: [PATCH 4/5] [None][test] Revert redundant AD accuracy helper Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com> --- .../defs/accuracy/test_llm_api_autodeploy.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py index 06df87b6b763..902f7addf53a 100644 --- 
a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py +++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py @@ -168,14 +168,6 @@ def low_memory_overrides(config, return config -def disable_piecewise_cuda_graph_for_speculation(config: dict) -> dict: - """Disable piecewise CUDA graph capture for speculative AutoDeploy tests.""" - config.setdefault("transforms", - {}).setdefault("compile_model", - {})["piecewise_enabled"] = False - return config - - def reduced_model_kwargs(num_hidden_layers: int, model_path: str | None = None) -> dict: """Return model_kwargs to cap a model at ``num_hidden_layers`` layers. @@ -378,7 +370,9 @@ def get_default_kwargs(self, attn_backend="flashinfer"): "torch_dtype": "bfloat16" }, } - disable_piecewise_cuda_graph_for_speculation(kwargs) + kwargs.setdefault("transforms", + {}).setdefault("compile_model", + {})["piecewise_enabled"] = False return kwargs From aa3f6a24e1340e2bc9956a1e7eef6ff85544edea Mon Sep 17 00:00:00 2001 From: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:57:44 -0700 Subject: [PATCH 5/5] default max_seq_len for small model tests to prevent bad defaults from factory. 
Maybe we want better defaults in the future, but production config files are agent-generated, so they don't have this issue when they set max_seq_len Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com> --- tests/unittest/auto_deploy/_utils_test/_model_test_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unittest/auto_deploy/_utils_test/_model_test_utils.py b/tests/unittest/auto_deploy/_utils_test/_model_test_utils.py index 6069e8a0526c..e7b79ce4fd86 100644 --- a/tests/unittest/auto_deploy/_utils_test/_model_test_utils.py +++ b/tests/unittest/auto_deploy/_utils_test/_model_test_utils.py @@ -641,6 +641,7 @@ def get_small_model_config(model_hub_id: str, **llm_args_kwargs) -> Dict[str, An "free_gpu_memory_fraction": 0.0, # No resizing of the cache to keep the mem footprint small } llm_args["max_batch_size"] = 2 # Minimum batching to speed up things + llm_args["max_seq_len"] = 256 llm_args["cuda_graph_config"] = {"max_batch_size": 2} # Match max_batch_size # update with custom llm_args kwargs llm_args.update(llm_args_kwargs)