3434from tensorrt_llm .inputs import prompt_inputs
3535from tensorrt_llm .inputs .data import TokensPrompt
3636from tensorrt_llm .inputs .multimodal import MultimodalServerConfig
37- from tensorrt_llm .inputs .utils import ConversationMessage , apply_chat_template
37+ from tensorrt_llm .inputs .utils import (ConversationMessage ,
38+ async_apply_chat_template )
3839from tensorrt_llm .llmapi import DisaggregatedParams as LlmDisaggregatedParams
3940from tensorrt_llm .llmapi import MultimodalEncoder , SchedulingParams , tracing
4041from tensorrt_llm .llmapi .disagg_utils import (DisaggClusterConfig ,
@@ -1200,7 +1201,7 @@ async def chat_stream_generator(
12001201 if request .prompt_token_ids is not None :
12011202 prompt = request .prompt_token_ids
12021203 else :
1203- prompt : str = apply_chat_template (
1204+ prompt_task = async_apply_chat_template (
12041205 model_type = resolve_top_level_model_type (self .model_config ),
12051206 tokenizer = self .tokenizer ,
12061207 processor = self .processor ,
@@ -1212,9 +1213,12 @@ async def chat_stream_generator(
12121213 chat_template = request .chat_template or self .chat_template ,
12131214 chat_template_kwargs = request .chat_template_kwargs or {},
12141215 )
1216+ prompt , (mm_data , mm_embeddings ) = await asyncio .gather (
1217+ prompt_task , mm_coroutines )
12151218 prompt = prompt_inputs (prompt )
12161219
1217- mm_data , mm_embeddings = await mm_coroutines
1220+ if request .prompt_token_ids is not None :
1221+ mm_data , mm_embeddings = await mm_coroutines
12181222 if mm_data :
12191223 prompt ["multi_modal_data" ] = mm_data
12201224 if mm_embeddings :
@@ -1350,7 +1354,7 @@ async def create_mm_embedding_response(promise: RequestOutput):
13501354 if request .prompt_token_ids is not None :
13511355 prompt = request .prompt_token_ids
13521356 else :
1353- prompt : str = apply_chat_template (
1357+ prompt_task = async_apply_chat_template (
13541358 model_type = resolve_top_level_model_type (self .model_config ),
13551359 tokenizer = self .tokenizer ,
13561360 processor = self .processor ,
@@ -1362,9 +1366,12 @@ async def create_mm_embedding_response(promise: RequestOutput):
13621366 chat_template = request .chat_template ,
13631367 chat_template_kwargs = request .chat_template_kwargs or {},
13641368 )
1369+ prompt , (mm_data , mm_embeddings ) = await asyncio .gather (
1370+ prompt_task , mm_coroutines )
13651371 prompt = prompt_inputs (prompt )
13661372
1367- mm_data , mm_embeddings = await mm_coroutines
1373+ if request .prompt_token_ids is not None :
1374+ mm_data , mm_embeddings = await mm_coroutines
13681375 if mm_embeddings :
13691376 raise ValueError ("Cannot use multimodal embeddings as input" )
13701377 if mm_data is not None :
0 commit comments