[None][fix] Address review comments on async chat-template offload

yechank-nvidia · yechank-nvidia · commit 45efcca57eef · 2026-06-22T07:01:59.000Z
- resource_governor: resolve the top-level model type in
  _convert_messages via resolve_top_level_model_type(), matching the
  serving call sites, instead of passing the raw
  model_config.model_type.
- responses_utils: unpack the (mm_data, mm_embeddings) tuple from the
  asyncio.gather result so _create_input_tokens returns mm_data (not the
  whole tuple) as its contract states.
- tests: add async regression coverage for both gather paths
  (ResourceGovernor._convert_messages and _create_input_tokens).

Signed-off-by: yechank <161688079+yechank-nvidia@users.noreply.github.com>
diff --git a/tensorrt_llm/serve/resource_governor.py b/tensorrt_llm/serve/resource_governor.py
@@ -30,7 +30,10 @@
 from tensorrt_llm.executor.request import TruncateKVCacheRequest
 from tensorrt_llm.inputs.utils import ConversationMessage, async_apply_chat_template
 from tensorrt_llm.logger import logger
-from tensorrt_llm.serve.chat_utils import parse_chat_messages_coroutines
+from tensorrt_llm.serve.chat_utils import (
+    parse_chat_messages_coroutines,
+    resolve_top_level_model_type,
+)
 from tensorrt_llm.serve.openai_protocol import KVCacheTruncateRequest
 
 
@@ -102,7 +105,7 @@ async def _convert_messages(
             messages, self.model_config, None
         )
         token_task = async_apply_chat_template(
-            model_type=self.model_config.model_type,
+            model_type=resolve_top_level_model_type(self.model_config),
             tokenizer=self.tokenizer,
             processor=self.processor,
             conversation=conversation,
diff --git a/tensorrt_llm/serve/responses_utils.py b/tensorrt_llm/serve/responses_utils.py
@@ -835,7 +835,9 @@ async def _create_input_tokens(
         mm_placeholder_counts=mm_placeholder_counts,
         enable_tokenize=True,
     )
-    token_ids, mm_data = await asyncio.gather(token_task, mm_coroutines)
+    token_ids, (mm_data,
+                _mm_embeddings) = await asyncio.gather(token_task,
+                                                       mm_coroutines)
 
     return token_ids, mm_data
 
diff --git a/tests/unittest/inputs/test_chat_template_dispatch.py b/tests/unittest/inputs/test_chat_template_dispatch.py
@@ -357,3 +357,101 @@ def apply_chat_template(self, **_):
         assert result == "rendered"
         assert tokenizer.worker_thread_id is not None
         assert tokenizer.worker_thread_id != event_loop_thread_id
+
+
+class TestServingChatTemplateGather:
+    """Cover the asyncio.gather integration in the serving chat-template paths."""
+
+    @pytest.mark.asyncio
+    async def test_resource_governor_convert_messages(self, monkeypatch):
+        from unittest.mock import Mock
+
+        import tensorrt_llm.serve.resource_governor as rg
+
+        governor = object.__new__(rg.ResourceGovernor)
+        governor.model_config = Mock()
+        governor.tokenizer = Mock()
+        governor.processor = None
+
+        async def fake_mm_coroutine():
+            # parse_chat_messages_coroutines' coroutine yields
+            # (mm_data, mm_embeddings).
+            return ({"image": ["data"]}, None)
+
+        monkeypatch.setattr(
+            rg,
+            "parse_chat_messages_coroutines",
+            lambda messages, model_config, _: ([], fake_mm_coroutine(), [{}]),
+        )
+        # Must resolve the top-level model type, matching the serving call
+        # sites (not the raw model_config.model_type).
+        monkeypatch.setattr(rg, "resolve_top_level_model_type", lambda cfg: "resolved-model-type")
+
+        captured = {}
+
+        async def fake_async_apply(**kwargs):
+            captured.update(kwargs)
+            return [1, 2, 3]
+
+        monkeypatch.setattr(rg, "async_apply_chat_template", fake_async_apply)
+
+        token_ids = await governor._convert_messages(
+            messages=[{"role": "user", "content": "hi"}],
+            tool_dicts=None,
+            add_generation_prompt=True,
+            documents=None,
+            chat_template=None,
+            chat_template_kwargs=None,
+        )
+
+        # Returns only token_ids, not the (mm_data, mm_embeddings) tuple.
+        assert token_ids == [1, 2, 3]
+        # Uses the top-level resolver and forwards the real placeholder counts.
+        assert captured["model_type"] == "resolved-model-type"
+        assert captured["mm_placeholder_counts"] == [{}]
+
+    @pytest.mark.asyncio
+    async def test_responses_create_input_tokens_unpacks_mm_tuple(self, monkeypatch):
+        """_create_input_tokens must return mm_data, not the whole gather tuple."""
+        from unittest.mock import Mock
+
+        import tensorrt_llm.serve.responses_utils as ru
+
+        async def fake_create_input_messages(request, prev_msgs):
+            return [{"role": "user", "content": "hi"}]
+
+        async def fake_mm_coroutine():
+            return ({"image": ["data"]}, {"image": ["embed"]})
+
+        monkeypatch.setattr(ru, "_create_input_messages", fake_create_input_messages)
+        monkeypatch.setattr(
+            ru,
+            "parse_chat_messages_coroutines",
+            lambda messages, model_config: ([], fake_mm_coroutine(), [{}]),
+        )
+        monkeypatch.setattr(ru, "resolve_top_level_model_type", lambda cfg: "resolved-model-type")
+        monkeypatch.setattr(ru, "_get_chat_completion_function_tools", lambda tools: [])
+
+        async def fake_async_apply(**kwargs):
+            return [1, 2, 3]
+
+        monkeypatch.setattr(ru, "async_apply_chat_template", fake_async_apply)
+
+        request = Mock()
+        request.tools = None
+        request.store = False
+
+        token_ids, mm_data = await ru._create_input_tokens(
+            request=request,
+            prev_response=None,
+            prev_msgs=None,
+            conversation_store=None,
+            enable_store=False,
+            tokenizer=Mock(),
+            model_config=Mock(),
+            processor=None,
+        )
+
+        assert token_ids == [1, 2, 3]
+        # mm_data is the data dict, not the (mm_data, mm_embeddings) tuple.
+        assert mm_data == {"image": ["data"]}

Original file line number	Diff line number	Diff line change
`@@ -835,7 +835,9 @@ async def _create_input_tokens(`
`835`	`835`	`mm_placeholder_counts=mm_placeholder_counts,`
`836`	`836`	`enable_tokenize=True,`
`837`	`837`	`)`
`838`		`- token_ids, mm_data = await asyncio.gather(token_task, mm_coroutines)`
	`838`	`+ token_ids, (mm_data,`
	`839`	`+ _mm_embeddings) = await asyncio.gather(token_task,`
	`840`	`+ mm_coroutines)`
`839`	`841`
`840`	`842`	`return token_ids, mm_data`
`841`	`843`