Fix smoke test failures; mostly flashinfer shapes

govind-ramnarayan · govind-ramnarayan · commit d66f329826f5 · 2026-06-16T11:50:32.000-07:00
Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
diff --git a/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_build_small_single.py b/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_build_small_single.py
@@ -214,7 +214,10 @@ def _check_ad_config(experiment_config: ExperimentConfig, llm_args: LlmArgs):
             "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
             {
                 "transforms": {
-                    "insert_cached_attention": {"backend": "flashinfer"},
+                    "insert_cached_attention": {
+                        "backend": "flashinfer",
+                        "requires_shape_prop": True,
+                    },
                     "compile_model": {
                         "backend": "torch-cudagraph",
                         "cuda_graph_batch_sizes": [1, 2],
@@ -295,6 +298,22 @@ def test_build_ad(model_hub_id: str, llm_extra_args: dict):
     experiment_config = get_small_model_config(model_hub_id, **llm_extra_args)
     experiment_config["args"]["runtime"] = "demollm"  # Default runtime set to demollm
     experiment_config["args"]["world_size"] = 0  # Default world_size set to 0
+    if (
+        model_hub_id == "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+        and llm_extra_args.get("transforms", {})
+        .get("compile_model", {})
+        .get("backend")
+        == "torch-cudagraph"
+    ):
+        experiment_config["args"]["max_batch_size"] = 1
+        experiment_config["args"]["max_input_len"] = 64
+        experiment_config["args"]["max_seq_len"] = 128
+        experiment_config["args"]["max_num_tokens"] = 128
+        experiment_config["args"]["cuda_graph_config"] = {
+            "batch_sizes": [1],
+            "max_batch_size": 1,
+        }
+        experiment_config["prompt"]["batch_size"] = 1
 
     print(f"Experiment Config: {experiment_config}")
     experiment_config = ExperimentConfig(**experiment_config)
diff --git a/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_guided_decoding_regex.py b/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_guided_decoding_regex.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -37,6 +37,14 @@ def test_ad_guided_decoding_regex_e2e():
     # NOTE: trtllm attention backend fails on B200 (likely illegal memory access); use flashinfer.
     experiment_config["args"]["attn_backend"] = "flashinfer"
     experiment_config["args"]["guided_decoding_backend"] = guided_decoding_backend
+    experiment_config["args"]["max_batch_size"] = 1
+    experiment_config["args"]["max_input_len"] = 64
+    experiment_config["args"]["max_seq_len"] = 128
+    experiment_config["args"]["max_num_tokens"] = 128
+    experiment_config["args"]["cuda_graph_config"] = {
+        "batch_sizes": [1],
+        "max_batch_size": 1,
+    }
 
     experiment_config["prompt"]["batch_size"] = 1
     experiment_config["prompt"]["queries"] = test_case["prompt"]
@@ -46,7 +54,7 @@ def test_ad_guided_decoding_regex_e2e():
     # Need to introduce the guided decoding params after ExperimentConfig construction
     # because otherwise they get unpacked as a dict.
     cfg.prompt.sp_kwargs = {
-        "max_tokens": 10,
+        "max_tokens": 16,
         "top_k": None,
         "temperature": 0.1,
         "guided_decoding": GuidedDecodingParams(regex=test_case["regex"]),
diff --git a/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_trtllm_sampler.py b/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_trtllm_sampler.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,6 +31,14 @@ def test_ad_trtllm_sampler_smoke():
     # NOTE: trtllm attention backend fails on B200 (likely illegal memory access); use flashinfer.
     experiment_config["args"]["attn_backend"] = "flashinfer"
     experiment_config["args"]["sampler_type"] = SamplerType.TRTLLMSampler
+    experiment_config["args"]["max_batch_size"] = 1
+    experiment_config["args"]["max_input_len"] = 64
+    experiment_config["args"]["max_seq_len"] = 128
+    experiment_config["args"]["max_num_tokens"] = 128
+    experiment_config["args"]["cuda_graph_config"] = {
+        "batch_sizes": [1],
+        "max_batch_size": 1,
+    }
 
     # Setup simple prompt
     experiment_config["prompt"]["batch_size"] = 1
diff --git a/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_trtllm_serve.py b/tests/unittest/auto_deploy/singlegpu/smoke/test_ad_trtllm_serve.py
@@ -41,6 +41,14 @@ def test_trtllm_serve_openai_chat_completion(tmp_path):
 
     # NOTE: trtllm attention backend fails on B200 (likely illegal memory access); use flashinfer.
     extra_args["attn_backend"] = "flashinfer"
+    extra_args["max_batch_size"] = 1
+    extra_args["max_input_len"] = 64
+    extra_args["max_seq_len"] = 128
+    extra_args["max_num_tokens"] = 128
+    extra_args["cuda_graph_config"] = {
+        "batch_sizes": [1],
+        "max_batch_size": 1,
+    }
     extra_options_path = tmp_path / "extra_llm_api_options.yaml"
     with open(extra_options_path, "w") as f:
         yaml.safe_dump(extra_args, f)