Skip to content

Commit 265f3d6

Browse files
committed
FIX: handle empty chat_template in multimodal chat engines
- VLLMMultiModel: fall back to the tokenizer's chat_template when model_family.chat_template is empty
- SGLANGVisionModel: same logic as VLLMMultiModel, aligned with MLX core behavior
- SGLANGModel: use an empty string instead of asserting when chat_template is empty

When chat_template is empty, the multimodal engines (VLLMMultiModel, SGLANGVisionModel) will:
1. Get the tokenizer and retrieve its chat_template attribute
2. If it is still empty, raise ValueError
3. Pass the tokenizer to get_full_context for proper template application

This aligns the behavior across the vllm, sglang, and mlx engines.
1 parent 06d9f28 commit 265f3d6

2 files changed

Lines changed: 33 additions & 7 deletions

File tree

xinference/model/llm/sglang/core.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -716,7 +716,10 @@ async def async_chat(
716716
generate_config: Optional[Dict] = None,
717717
request_id: Optional[str] = None,
718718
) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
719-
assert self.model_family.chat_template is not None
719+
# Handle empty chat_template by using empty string (sglang server will use model's default)
720+
chat_template: str = (
721+
self.model_family.chat_template if self.model_family.chat_template else ""
722+
)
720723
# fix: Object of type list_iterator is not JSON serializable
721724
tools = list(generate_config.pop("tools", [])) if generate_config else None
722725
model_family = self.model_family.model_family or self.model_family.model_name
@@ -736,7 +739,7 @@ async def async_chat(
736739
):
737740
full_context_kwargs["tools"] = tools
738741
full_prompt = self.get_full_context(
739-
messages, self.model_family.chat_template, **full_context_kwargs
742+
messages, chat_template, **full_context_kwargs
740743
)
741744
generate_config = self._sanitize_chat_config(generate_config)
742745
stream = generate_config.get("stream", None)
@@ -822,9 +825,17 @@ async def async_chat(
822825
messages = self._transform_messages(messages)
823826

824827
tools = list(generate_config.pop("tools", [])) if generate_config else None
825-
chat_template: str = (
826-
self.model_family.chat_template if self.model_family.chat_template else ""
827-
)
828+
# Handle empty chat_template by falling back to tokenizer's chat_template
829+
chat_template = self.model_family.chat_template
830+
tokenizer = None
831+
if not chat_template:
832+
tokenizer = self._tokenizer
833+
if tokenizer is not None:
834+
chat_template = getattr(tokenizer, "chat_template", None)
835+
if not chat_template:
836+
raise ValueError(
837+
f"chat_template is required for model {self.model_uid}, but none was provided."
838+
)
828839
chat_template_kwargs = (
829840
self._get_chat_template_kwargs_from_generate_config(
830841
generate_config, self.reasoning_parser
@@ -833,7 +844,9 @@ async def async_chat(
833844
)
834845
chat_context_var.set(chat_template_kwargs)
835846
full_context_kwargs = chat_template_kwargs.copy()
836-
prompt = self.get_full_context(messages, chat_template, **full_context_kwargs)
847+
prompt = self.get_full_context(
848+
messages, chat_template, tokenizer=tokenizer, **full_context_kwargs
849+
)
837850

838851
images, video_inputs = process_vision_info(messages)
839852
if video_inputs:

xinference/model/llm/vllm/core.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1976,6 +1976,19 @@ async def async_chat(
19761976
):
19771977
full_context_kwargs["tools"] = tools
19781978
assert self.model_family.chat_template is not None
1979+
1980+
# Handle empty chat_template by falling back to tokenizer's chat_template
1981+
chat_template = self.model_family.chat_template
1982+
tokenizer = None
1983+
if not chat_template:
1984+
tokenizer = await self._get_tokenizer(None)
1985+
if tokenizer is not None:
1986+
chat_template = getattr(tokenizer, "chat_template", None)
1987+
if not chat_template:
1988+
raise ValueError(
1989+
f"chat_template is required for model {self.model_uid}, but none was provided."
1990+
)
1991+
19791992
if "omni" in self.model_family.model_ability:
19801993
audios, images, videos, video_kwargs = process_mm_info(
19811994
messages, use_audio_in_video=True, return_video_kwargs=True
@@ -1988,7 +2001,7 @@ async def async_chat(
19882001
)
19892002

19902003
prompt = self.get_full_context(
1991-
messages, self.model_family.chat_template, **full_context_kwargs
2004+
messages, chat_template, tokenizer=tokenizer, **full_context_kwargs
19922005
)
19932006

19942007
else:

0 commit comments

Comments
 (0)