3535from tensorrt_llm .inputs import prompt_inputs
3636from tensorrt_llm .inputs .data import TokensPrompt
3737from tensorrt_llm .inputs .multimodal import MultimodalServerConfig
38- from tensorrt_llm .inputs .utils import ConversationMessage , apply_chat_template
38+ from tensorrt_llm .inputs .utils import (ConversationMessage ,
39+ async_apply_chat_template )
3940from tensorrt_llm .llmapi import DisaggregatedParams as LlmDisaggregatedParams
4041from tensorrt_llm .llmapi import MultimodalEncoder , SchedulingParams , tracing
4142from tensorrt_llm .llmapi .disagg_utils import (DisaggClusterConfig ,
@@ -1261,7 +1262,7 @@ async def chat_stream_generator(
12611262 if request .prompt_token_ids is not None :
12621263 prompt = request .prompt_token_ids
12631264 else :
1264- prompt : str = apply_chat_template (
1265+ prompt_task = async_apply_chat_template (
12651266 model_type = resolve_top_level_model_type (self .model_config ),
12661267 tokenizer = self .tokenizer ,
12671268 processor = self .processor ,
@@ -1273,9 +1274,12 @@ async def chat_stream_generator(
12731274 chat_template = request .chat_template or self .chat_template ,
12741275 chat_template_kwargs = request .chat_template_kwargs or {},
12751276 )
1277+ prompt , (mm_data , mm_embeddings ) = await asyncio .gather (
1278+ prompt_task , mm_coroutines )
12761279 prompt = prompt_inputs (prompt )
12771280
1278- mm_data , mm_embeddings = await mm_coroutines
1281+ if request .prompt_token_ids is not None :
1282+ mm_data , mm_embeddings = await mm_coroutines
12791283 if mm_data :
12801284 prompt ["multi_modal_data" ] = mm_data
12811285 if mm_embeddings :
@@ -1414,7 +1418,7 @@ async def create_mm_embedding_response(promise: RequestOutput):
14141418 if request .prompt_token_ids is not None :
14151419 prompt = request .prompt_token_ids
14161420 else :
1417- prompt : str = apply_chat_template (
1421+ prompt_task = async_apply_chat_template (
14181422 model_type = resolve_top_level_model_type (self .model_config ),
14191423 tokenizer = self .tokenizer ,
14201424 processor = self .processor ,
@@ -1426,9 +1430,12 @@ async def create_mm_embedding_response(promise: RequestOutput):
14261430 chat_template = request .chat_template ,
14271431 chat_template_kwargs = request .chat_template_kwargs or {},
14281432 )
1433+ prompt , (mm_data , mm_embeddings ) = await asyncio .gather (
1434+ prompt_task , mm_coroutines )
14291435 prompt = prompt_inputs (prompt )
14301436
1431- mm_data , mm_embeddings = await mm_coroutines
1437+ if request .prompt_token_ids is not None :
1438+ mm_data , mm_embeddings = await mm_coroutines
14321439 if mm_embeddings :
14331440 raise ValueError ("Cannot use multimodal embeddings as input" )
14341441 if mm_data is not None :
0 commit comments