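Address review comments (Codex): rename attention_type "default" to "mha", make it a required handler argument, and gate cache transceiver setup on a configured backend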

govind-ramnarayan · govind-ramnarayan · commit e2c608de7835 · 2026-05-13T16:36:54.000-07:00
Signed-off-by: Govind Ramnarayan <105831528+govind-ramnarayan@users.noreply.github.com>
diff --git a/examples/auto_deploy/model_registry/configs/disagg_ctx.yaml b/examples/auto_deploy/model_registry/configs/disagg_ctx.yaml
@@ -2,4 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 cache_transceiver_config:
   backend: DEFAULT
+# Prefill workers must run without overlap scheduling because the current
+# context-only transfer path sends KV cache when a request completes its context
+# phase, after all prefill chunks have run.
 disable_overlap_scheduler: true
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/flashinfer_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/flashinfer_attention.py
@@ -589,6 +589,7 @@ def get_cache_initializers(
                 dtype=cls.resolve_cache_dtype(cache_config.dtype, k_fake.dtype),
                 kv_factor=2,
                 kv_layout=_GlobalFlashInferPlanner.kv_layout,
+                attention_type="mha",
             )
         }
 
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_paged_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_paged_attention.py
@@ -1506,6 +1506,7 @@ def get_cache_initializers(
                 dtype=cls.resolve_cache_dtype(cache_config.dtype, k_fake.dtype),
                 kv_factor=2,
                 kv_layout=KV_LAYOUT,
+                attention_type="mha",
             )
         }
 
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/trtllm_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention/trtllm_attention.py
@@ -877,6 +877,7 @@ def get_cache_initializers(
                 dtype=cls.resolve_cache_dtype(cache_config.dtype, kv_dtype),
                 kv_factor=2,
                 kv_layout="HND",
+                attention_type="mha",
             )
         }
 
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py
@@ -39,7 +39,7 @@
 from ..utils.node_utils import extract_op_args, get_op_schema
 
 Constant = Union[int, float, str, None]
-AttentionType = Literal["default", "mla"]
+AttentionType = Literal["mha", "mla"]
 
 # Torch dtype → numpy dtype for fast list-to-tensor conversion.
 # numpy's list→array conversion is ~2-3x faster than torch.tensor(list) for large lists.
@@ -644,7 +644,7 @@ def __init__(
         # will store num_blocks later...
         self._num_blocks = None
 
-        self.attention_type: AttentionType = "default"
+        self.attention_type: Optional[AttentionType] = None
 
         # TODO (lucaslie): can we remove this eventually from this i/f?
         self.vocab_size_padded = vocab_size_padded
@@ -1582,7 +1582,7 @@ class KVPagedResourceHandler(ResourceHandler):
         kv_factor: The factor of the KV cache. Default is 2 for combined k/v cache.
         kv_layout: Memory layout for the KV cache. Either "HND" (head-num-dim) or "NHD" (num-head-dim).
             Default is "HND" which is the standard layout for flashinfer.
-        attention_type: Attention semantics for this cache resource.
+        attention_type: Attention layout semantics for this cache resource: ``"mha"`` or ``"mla"``.
     """
 
     @property
@@ -1595,9 +1595,9 @@ def __init__(
         num_kv_heads: int,
         head_dim: int,
         dtype: torch.dtype,
+        attention_type: AttentionType,
         kv_factor: int = 2,
         kv_layout: Literal["HND", "NHD"] = "HND",
-        attention_type: AttentionType = "default",
     ) -> None:
         """Initialize the KVPagedResourceHandler.
 
@@ -1607,15 +1607,15 @@ def __init__(
             dtype: The dtype of the KV cache.
             kv_factor: The factor of the KV cache. Default is 2.
             kv_layout: Memory layout - "HND" or "NHD". Default is "HND".
-            attention_type: Attention semantics for this cache resource.
+            attention_type: Attention layout semantics for this cache resource: ``"mha"`` or ``"mla"``.
         """
         self.num_kv_heads = num_kv_heads
         self.head_dim = head_dim
         self.dtype = dtype
         self.kv_factor = kv_factor
         assert kv_factor in [1, 2], f"Invalid kv_factor: {kv_factor}"
         self.kv_layout = kv_layout
-        assert attention_type in ["default", "mla"], f"Unsupported attention_type: {attention_type}"
+        assert attention_type in ["mha", "mla"], f"Unsupported attention_type: {attention_type}"
         self.attention_type = attention_type
 
     def __eq__(self, other: Optional[ResourceHandler]) -> bool:
diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -80,7 +80,7 @@
 from .interface import CachedSequenceInterface, GetInferenceModel
 
 _ATTENTION_TYPE_TO_CPP = {
-    "default": AttentionTypeCpp.DEFAULT,
+    "mha": AttentionTypeCpp.DEFAULT,
     "mla": AttentionTypeCpp.MLA,
 }
 
@@ -846,9 +846,10 @@ def _prepare_inputs(
         num_prefill_tokens = len(input_ids)
 
         for request in gen_requests:
-            # Overlap gathers tokens from the previous batch slot. Dummy padding
-            # requests and first-step generation-only disagg requests do not have
-            # a previous slot to gather from yet.
+            # Overlap gathers tokens from the previous batch slot. Non-overlap
+            # forwards do not pass new_tokens at all; first-step generation-only
+            # disagg requests may have new_tokens from a previous batch but do
+            # not have a previous batch slot (py_batch_idx) to gather from yet.
             is_overlap = (
                 has_new_tokens
                 and not self._disable_overlap_scheduler
@@ -1386,17 +1387,28 @@ def create_autodeploy_executor(
         kv_cache_manager if isinstance(kv_cache_manager, BaseMambaCacheManager) else None
     )
     cache_transceiver_config = ad_config.cache_transceiver_config
-    if cache_transceiver_config and cache_transceiver_config.max_tokens_in_buffer is None:
+    cache_transceiver_enabled = (
+        cache_transceiver_config is not None and cache_transceiver_config.backend is not None
+    )
+    if cache_transceiver_enabled and cache_transceiver_config.max_tokens_in_buffer is None:
         # The disagg transfer buffer must fit the full context segment handed to
         # the generation worker. AutoDeploy's cache interface exposes the tuned
         # maximum sequence length, which is a conservative upper bound.
         cache_transceiver_config.max_tokens_in_buffer = engine.cache_seq_interface.info.max_seq_len
 
     cache_attention_type = engine.cache_seq_interface.attention_type
-    try:
-        attention_type = _ATTENTION_TYPE_TO_CPP[cache_attention_type]
-    except KeyError as exc:
-        raise ValueError(f"Unsupported attention_type: {cache_attention_type!r}") from exc
+    if cache_transceiver_enabled and cache_attention_type is None:
+        raise RuntimeError(
+            "Cache transceiver is enabled, but AutoDeploy did not find a managed paged KV "
+            "resource to provide attention_type."
+        )
+    if cache_attention_type is None:
+        attention_type = AttentionTypeCpp.DEFAULT
+    else:
+        try:
+            attention_type = _ATTENTION_TYPE_TO_CPP[cache_attention_type]
+        except KeyError as exc:
+            raise ValueError(f"Unsupported attention_type: {cache_attention_type!r}") from exc
 
     kv_cache_transceiver = create_kv_cache_transceiver(
         dist_mapping,
diff --git a/tensorrt_llm/_torch/auto_deploy/shim/interface.py b/tensorrt_llm/_torch/auto_deploy/shim/interface.py
@@ -699,7 +699,7 @@ def _create_kv_cache_manager(self, max_tokens: Optional[int] = None) -> Dict:
         """
         # 1. Identify managed resources
         kv_ref, kv_managed = self._identify_managed_kv_resources()
-        self.info.attention_type = kv_ref.attention_type if kv_ref is not None else "default"
+        self.info.attention_type = kv_ref.attention_type if kv_ref is not None else None
         ssm_ref, ssm_managed, ssm_spec, conv_ref, conv_managed, conv_spec = (
             self._identify_managed_state_resources()
         )
@@ -939,7 +939,7 @@ def kv_cache_config(self) -> KvCacheConfig:
         return self._kv_cache_config_original
 
     @property
-    def attention_type(self) -> AttentionType:
+    def attention_type(self) -> Optional[AttentionType]:
         return self.info.attention_type
 
     def _clear_caches(self) -> None:
diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
@@ -1,18 +1,3 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import asyncio
 import os
 import pickle
diff --git a/tests/unittest/auto_deploy/singlegpu/custom_ops/test_resource_handlers.py b/tests/unittest/auto_deploy/singlegpu/custom_ops/test_resource_handlers.py
@@ -31,7 +31,9 @@
 
 def test_paged_handler_with_nhd_layout():
     """Test KVPagedResourceHandler with NHD layout."""
-    handler = KVPagedResourceHandler(8, 64, dtype=torch.bfloat16, kv_layout="NHD")
+    handler = KVPagedResourceHandler(
+        8, 64, dtype=torch.bfloat16, kv_layout="NHD", attention_type="mha"
+    )
     assert handler.num_kv_heads == 8
     assert handler.head_dim == 64
     assert handler.dtype == torch.bfloat16
@@ -40,7 +42,9 @@ def test_paged_handler_with_nhd_layout():
 
 def test_paged_handler_with_hnd_layout():
     """Test KVPagedResourceHandler with explicit HND layout."""
-    handler = KVPagedResourceHandler(4, 128, dtype=torch.float32, kv_layout="HND")
+    handler = KVPagedResourceHandler(
+        4, 128, dtype=torch.float32, kv_layout="HND", attention_type="mha"
+    )
     assert handler.num_kv_heads == 4
     assert handler.head_dim == 128
     assert handler.dtype == torch.float32
@@ -50,7 +54,9 @@ def test_paged_handler_with_hnd_layout():
 @pytest.mark.parametrize("kv_layout", ["HND", "NHD"])
 def test_paged_handler_allocate_with_blocks(kv_layout):
     """Verify KVPagedResourceHandler.allocate() returns correct shape."""
-    handler = KVPagedResourceHandler(8, 64, dtype=torch.float16, kv_layout=kv_layout)
+    handler = KVPagedResourceHandler(
+        8, 64, dtype=torch.float16, kv_layout=kv_layout, attention_type="mha"
+    )
     tokens_per_block = 32
     seq_info = SequenceInfo(
         max_seq_len=128,
@@ -88,7 +94,7 @@ def test_paged_handler_allocate_with_blocks(kv_layout):
 
 def test_paged_handler_is_resource_handler():
     """Verify KVPagedResourceHandler is a ResourceHandler subclass."""
-    handler = KVPagedResourceHandler(8, 64, dtype=torch.float16)
+    handler = KVPagedResourceHandler(8, 64, dtype=torch.float16, attention_type="mha")
     assert isinstance(handler, ResourceHandler)
 
 
@@ -271,9 +277,13 @@ def test_resolve_cache_dtype_explicit_fp8():
 
 def test_kv_paged_handler_eq_same_head_dim_dtype():
     """Verify KVPagedResourceHandler __eq__ checks head_dim and dtype."""
-    h1 = KVPagedResourceHandler(8, 64, dtype=torch.float16)
-    h2 = KVPagedResourceHandler(4, 64, dtype=torch.float16)  # Different num_kv_heads
-    h3 = KVPagedResourceHandler(8, 64, dtype=torch.float16, kv_layout="NHD")  # Different layout
+    h1 = KVPagedResourceHandler(8, 64, dtype=torch.float16, attention_type="mha")
+    h2 = KVPagedResourceHandler(
+        4, 64, dtype=torch.float16, attention_type="mha"
+    )  # Different num_kv_heads
+    h3 = KVPagedResourceHandler(
+        8, 64, dtype=torch.float16, kv_layout="NHD", attention_type="mha"
+    )  # Different layout
 
     # head_dim, kv_factor, dtype, kv_layout -> equal (num_kv_heads doesn't matter for compatibility)
     assert h1 == h2
@@ -282,22 +292,28 @@ def test_kv_paged_handler_eq_same_head_dim_dtype():
 
 def test_kv_paged_handler_eq_different_head_dim_or_dtype():
     """Verify KVPagedResourceHandler __eq__ returns False for different head_dim or dtype."""
-    h1 = KVPagedResourceHandler(8, 64, dtype=torch.float16)
-    h2 = KVPagedResourceHandler(8, 128, dtype=torch.float16)  # Different head_dim
-    h3 = KVPagedResourceHandler(8, 64, dtype=torch.bfloat16)  # Different dtype
+    h1 = KVPagedResourceHandler(8, 64, dtype=torch.float16, attention_type="mha")
+    h2 = KVPagedResourceHandler(
+        8, 128, dtype=torch.float16, attention_type="mha"
+    )  # Different head_dim
+    h3 = KVPagedResourceHandler(
+        8, 64, dtype=torch.bfloat16, attention_type="mha"
+    )  # Different dtype
 
     assert h1 != h2
     assert h1 != h3
 
 
 def test_kv_paged_handler_eq_different_attention_type():
     """Verify KVPagedResourceHandler __eq__ rejects different attention semantics."""
-    default_handler = KVPagedResourceHandler(8, 64, dtype=torch.float16, kv_factor=1)
+    default_handler = KVPagedResourceHandler(
+        8, 64, dtype=torch.float16, kv_factor=1, attention_type="mha"
+    )
     mla_handler = KVPagedResourceHandler(
         8, 64, dtype=torch.float16, kv_factor=1, attention_type="mla"
     )
 
-    assert default_handler.attention_type == "default"
+    assert default_handler.attention_type == "mha"
     assert mla_handler.attention_type == "mla"
     assert default_handler != mla_handler
 
diff --git a/tests/unittest/auto_deploy/singlegpu/shim/test_cached_sequence_interface.py b/tests/unittest/auto_deploy/singlegpu/shim/test_cached_sequence_interface.py
diff --git a/tests/unittest/auto_deploy/singlegpu/shim/test_create_ad_executor.py b/tests/unittest/auto_deploy/singlegpu/shim/test_create_ad_executor.py
diff --git a/tests/unittest/auto_deploy/singlegpu/smoke/test_disagg.py b/tests/unittest/auto_deploy/singlegpu/smoke/test_disagg.py
diff --git a/tests/unittest/auto_deploy/singlegpu/transformations/library/test_kv_cache.py b/tests/unittest/auto_deploy/singlegpu/transformations/library/test_kv_cache.py

Original file line number	Diff line number	Diff line change
`@@ -589,6 +589,7 @@ def get_cache_initializers(`
`589`	`589`	`dtype=cls.resolve_cache_dtype(cache_config.dtype, k_fake.dtype),`
`590`	`590`	`kv_factor=2,`
`591`	`591`	`kv_layout=_GlobalFlashInferPlanner.kv_layout,`
	`592`	`+ attention_type="mha",`
`592`	`593`	`)`
`593`	`594`	`}`
`594`	`595`
Original file line number	Diff line number	Diff line change
`@@ -1506,6 +1506,7 @@ def get_cache_initializers(`
`1506`	`1506`	`dtype=cls.resolve_cache_dtype(cache_config.dtype, k_fake.dtype),`
`1507`	`1507`	`kv_factor=2,`
`1508`	`1508`	`kv_layout=KV_LAYOUT,`
	`1509`	`+ attention_type="mha",`
`1509`	`1510`	`)`
`1510`	`1511`	`}`
`1511`	`1512`
Original file line number	Diff line number	Diff line change
`@@ -877,6 +877,7 @@ def get_cache_initializers(`
`877`	`877`	`dtype=cls.resolve_cache_dtype(cache_config.dtype, kv_dtype),`
`878`	`878`	`kv_factor=2,`
`879`	`879`	`kv_layout="HND",`
	`880`	`+ attention_type="mha",`
`880`	`881`	`)`
`881`	`882`	`}`
`882`	`883`