3535from tensorrt_llm .inputs import prompt_inputs
3636from tensorrt_llm .inputs .data import TokensPrompt
3737from tensorrt_llm .inputs .multimodal import MultimodalServerConfig
38- from tensorrt_llm .inputs .utils import ConversationMessage , apply_chat_template
38+ from tensorrt_llm .inputs .utils import (ConversationMessage ,
39+ async_apply_chat_template )
3940from tensorrt_llm .llmapi import DisaggregatedParams as LlmDisaggregatedParams
4041from tensorrt_llm .llmapi import MultimodalEncoder , SchedulingParams , tracing
4142from tensorrt_llm .llmapi .disagg_utils import (DisaggClusterConfig ,
@@ -1241,7 +1242,7 @@ async def chat_stream_generator(
12411242 if request .prompt_token_ids is not None :
12421243 prompt = request .prompt_token_ids
12431244 else :
1244- prompt : str = apply_chat_template (
1245+ prompt_task = async_apply_chat_template (
12451246 model_type = resolve_top_level_model_type (self .model_config ),
12461247 tokenizer = self .tokenizer ,
12471248 processor = self .processor ,
@@ -1253,9 +1254,12 @@ async def chat_stream_generator(
12531254 chat_template = request .chat_template or self .chat_template ,
12541255 chat_template_kwargs = request .chat_template_kwargs or {},
12551256 )
1257+ prompt , (mm_data , mm_embeddings ) = await asyncio .gather (
1258+ prompt_task , mm_coroutines )
12561259 prompt = prompt_inputs (prompt )
12571260
1258- mm_data , mm_embeddings = await mm_coroutines
1261+ if request .prompt_token_ids is not None :
1262+ mm_data , mm_embeddings = await mm_coroutines
12591263 if mm_data :
12601264 prompt ["multi_modal_data" ] = mm_data
12611265 if mm_embeddings :
@@ -1394,7 +1398,7 @@ async def create_mm_embedding_response(promise: RequestOutput):
13941398 if request .prompt_token_ids is not None :
13951399 prompt = request .prompt_token_ids
13961400 else :
1397- prompt : str = apply_chat_template (
1401+ prompt_task = async_apply_chat_template (
13981402 model_type = resolve_top_level_model_type (self .model_config ),
13991403 tokenizer = self .tokenizer ,
14001404 processor = self .processor ,
@@ -1406,9 +1410,12 @@ async def create_mm_embedding_response(promise: RequestOutput):
14061410 chat_template = request .chat_template ,
14071411 chat_template_kwargs = request .chat_template_kwargs or {},
14081412 )
1413+ prompt , (mm_data , mm_embeddings ) = await asyncio .gather (
1414+ prompt_task , mm_coroutines )
14091415 prompt = prompt_inputs (prompt )
14101416
1411- mm_data , mm_embeddings = await mm_coroutines
1417+ if request .prompt_token_ids is not None :
1418+ mm_data , mm_embeddings = await mm_coroutines
14121419 if mm_embeddings :
14131420 raise ValueError ("Cannot use multimodal embeddings as input" )
14141421 if mm_data is not None :
0 commit comments