diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index e10f20f44e..804bbf87d4 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -393,9 +393,6 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque request_id = str(session.session_id) created_time = int(time.time()) - if isinstance(request.stop, str): - request.stop = [request.stop] - tokenizer = VariableInterface.async_engine.tokenizer.model.model gen_logprobs, logits_processors = None, None if request.logprobs and request.top_logprobs: @@ -416,18 +413,15 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque 'Please launch the api_server with --tool-call-parser if you want to use tool.') random_seed = request.seed if request.seed is not None else None - max_new_tokens = (request.max_completion_tokens if request.max_completion_tokens else request.max_tokens) - response_format = None - if request.response_format and request.response_format.type != 'text': - response_format = request.response_format.model_dump() parser_cls = VariableInterface.response_parser_cls response_parser = parser_cls(request=request, tokenizer=tokenizer) - # request might be adjusted by tool parser + # request is normalized and may be adjusted by the parser + # (e.g. 
GPT-OSS clears response_format and injects the schema into messages) request = response_parser.request gen_config = GenerationConfig( - max_new_tokens=max_new_tokens, + max_new_tokens=request.max_completion_tokens, do_sample=True, logprobs=gen_logprobs, top_k=request.top_k, @@ -438,7 +432,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque stop_words=request.stop, include_stop_str_in_output=request.include_stop_str_in_output, skip_special_tokens=request.skip_special_tokens, - response_format=response_format, + response_format=request.response_format, logits_processors=logits_processors, min_new_tokens=request.min_new_tokens, min_p=request.min_p, diff --git a/lmdeploy/serve/parsers/_openai_harmony.py b/lmdeploy/serve/parsers/_openai_harmony.py index b1bb492709..9de071dcdd 100644 --- a/lmdeploy/serve/parsers/_openai_harmony.py +++ b/lmdeploy/serve/parsers/_openai_harmony.py @@ -3,6 +3,7 @@ available.""" from __future__ import annotations +import json import re from typing import TYPE_CHECKING @@ -16,14 +17,17 @@ FunctionCall, ToolCall, ) +from lmdeploy.utils import get_logger -from .response_parser import ResponseParser, ResponseParserManager +from .response_parser import ResponseParser, ResponseParserManager, normalize_chat_request if TYPE_CHECKING: from transformers import PreTrainedTokenizerBase from lmdeploy.serve.openai.protocol import ChatCompletionRequest +logger = get_logger('lmdeploy') + _harmony_encoding = None @@ -55,6 +59,8 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize else: # Unit tests may inject a lightweight sentinel request object. 
# Methods of ``GptOssResponseParser`` (lmdeploy/serve/parsers/_openai_harmony.py).
def _convert_response_format_to_harmony(self):
    """Convert ``response_format`` to Harmony-native mode for GPT-OSS.

    GPT-OSS uses Harmony mode for structured output, which conflicts with
    the engine's built-in JSON/response-format mode. This method injects
    the response_format schema into the prompt as a ``# Response Formats``
    section and clears ``response_format`` on the request so that only the
    Harmony-native instructions are used.

    ``response_format`` may be a Pydantic-style object (``.type`` and
    ``.model_dump()``) or an already-dumped plain ``dict``. A missing value
    or ``type == 'text'`` leaves the request untouched.

    Injection rules:

    - ``messages`` is a ``str``: the section is appended after a blank line.
    - ``messages`` is a ``list``: the section is appended to the first dict
      message whose role is ``'system'`` (string content is concatenated,
      list content gets an extra text block). If there is no such message,
      or its content is neither ``str`` nor ``list``, a new system message
      is inserted at index 0.
    - anything else: a warning is logged and only ``response_format`` is
      cleared.

    The conversion is best effort: any exception is logged and
    ``response_format`` is still cleared.
    """
    fmt = getattr(self.request, 'response_format', None)
    if fmt is None:
        return

    # A dict has no ``.type`` attribute; read the key instead so that an
    # already-dumped response_format is not mistaken for 'text'.
    fmt_is_dict = isinstance(fmt, dict)
    fmt_type = fmt.get('type', 'text') if fmt_is_dict else getattr(fmt, 'type', 'text')
    if fmt_type == 'text':
        return

    try:
        format_json = json.dumps(fmt if fmt_is_dict else fmt.model_dump())
        format_body = f'# Response Formats\n{format_json}'
        messages = self.request.messages

        if isinstance(messages, str):
            self._clear_response_format(messages=messages + '\n\n' + format_body)
            return

        if not isinstance(messages, list):
            logger.warning('Cannot inject response_format schema into '
                           'non-list messages for GPT-OSS; clearing response_format only.')
            self._clear_response_format()
            return

        # Work on a shallow copy so the caller's list is never mutated.
        new_messages = list(messages)
        system_idx = next(
            (i for i, msg in enumerate(new_messages) if isinstance(msg, dict) and msg.get('role') == 'system'),
            None,
        )

        if system_idx is None:
            new_messages.insert(0, {'role': 'system', 'content': format_body})
        else:
            system_msg = new_messages[system_idx]
            content = system_msg.get('content')
            if isinstance(content, list):
                # Multimodal content blocks: append a text block.
                new_messages[system_idx] = {
                    **system_msg,
                    'content': content + [{'type': 'text', 'text': format_body}],
                }
            elif isinstance(content, str):
                new_messages[system_idx] = {
                    **system_msg,
                    'content': (content + '\n\n' + format_body) if content else format_body,
                }
            else:
                # content is None or an unexpected type: insert a separate
                # system message so the schema is still available.
                new_messages.insert(0, {'role': 'system', 'content': format_body})

        self._clear_response_format(messages=new_messages)
    except Exception:
        logger.exception('Failed to convert response_format to Harmony-native mode for GPT-OSS')
        # Still clear response_format to avoid the Harmony/JSON mode conflict.
        self._clear_response_format()


def _clear_response_format(self, messages=None):
    """Clear ``response_format`` on the request and optionally replace
    ``messages``.

    Requests exposing ``model_copy`` are replaced by an updated copy;
    any other object is mutated in place.

    Args:
        messages: replacement for ``request.messages``; ``None`` keeps the
            current messages.
    """
    if hasattr(self.request, 'model_copy'):
        update = {'response_format': None}
        if messages is not None:
            update['messages'] = messages
        self.request = self.request.model_copy(update=update)
    else:
        self.request.response_format = None
        if messages is not None:
            self.request.messages = messages
def normalize_chat_request(request: ChatCompletionRequest) -> ChatCompletionRequest:
    """Normalize a ChatCompletionRequest for downstream consumption.

    - ``response_format``: ``ResponseFormat -> dict``, ``type='text' -> None``
    - ``stop``: ``str -> list[str]``
    - ``max_completion_tokens``: resolves from deprecated ``max_tokens``

    The function is idempotent: a ``response_format`` that is already a
    plain ``dict`` is kept as is (or cleared when its ``type`` is
    ``'text'``), so normalizing an already-normalized request is a no-op.

    Args:
        request: the incoming request. Objects without ``model_copy``
            (e.g. lightweight test sentinels) are returned untouched.

    Returns:
        ``request`` itself when nothing needs to change, otherwise a copy
        produced by ``request.model_copy(update=...)``; the input object is
        never mutated.
    """
    if not hasattr(request, 'model_copy'):
        return request

    updates: dict = {}

    fmt = request.response_format
    if isinstance(fmt, dict):
        # Already dumped (e.g. by a previous normalization pass); a dict has
        # no ``.type`` attribute, so inspect the key instead.
        if fmt.get('type') == 'text':
            updates['response_format'] = None
    elif fmt is not None:
        updates['response_format'] = None if fmt.type == 'text' else fmt.model_dump()

    if isinstance(request.stop, str):
        updates['stop'] = [request.stop]

    if request.max_completion_tokens is None:
        # ``max_tokens`` is the deprecated spelling; use it only as fallback.
        max_tokens = getattr(request, 'max_tokens', None)
        if max_tokens is not None:
            updates['max_completion_tokens'] = max_tokens

    if updates:
        request = request.model_copy(update=updates)

    return request
class TestGptOssResponseFormatHarmonyConversion:
    """Tests for
    :meth:`GptOssResponseParser._convert_response_format_to_harmony`."""

    @pytest.fixture(autouse=True)
    def _patch_streamable_parser(self, monkeypatch):
        # Replace the real StreamableParser with the fake for every test.
        def _fake_factory(*args, **kwargs):
            return _FakeStreamableParser({})

        monkeypatch.setattr(openai_harmony_mod, 'StreamableParser', _fake_factory)

    @staticmethod
    def _schema_format(schema):
        """Build a ``json_schema`` ResponseFormat named ``test`` around
        *schema*."""
        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat

        return ResponseFormat(
            type='json_schema',
            json_schema=JsonSchema(name='test', schema=schema),
        )

    @staticmethod
    def _parsed_request(messages, response_format=None):
        """Run a request through the parser and return ``parser.request``."""
        fields = {'model': 'openai/gpt-oss-20b', 'messages': messages}
        if response_format is not None:
            fields['response_format'] = response_format
        parser = gpt_oss_mod.GptOssResponseParser(
            request=ChatCompletionRequest(**fields),
            tokenizer=object(),
        )
        return parser.request

    @staticmethod
    def _assert_schema_section(text, schema):
        """Assert *text* carries the section header and the dumped schema."""
        import json as _json

        assert '# Response Formats' in text
        assert _json.dumps(schema) in text

    def test_response_format_cleared_after_conversion(self):
        """response_format must be None after the parser processes it."""
        schema = {'type': 'object', 'properties': {'x': {'type': 'integer'}}}
        result = self._parsed_request(
            [{'role': 'user', 'content': 'hi'}],
            self._schema_format(schema),
        )
        assert result.response_format is None

    def test_schema_appended_to_existing_system_message(self):
        """When a system message already exists the schema is appended to
        it."""
        schema = {'type': 'object', 'properties': {'x': {'type': 'integer'}}}
        result = self._parsed_request(
            [
                {'role': 'system', 'content': 'You are helpful.'},
                {'role': 'user', 'content': 'hi'},
            ],
            self._schema_format(schema),
        )

        first = result.messages[0]
        assert first['role'] == 'system'
        assert result.response_format is None
        # The schema body must appear in the system message
        self._assert_schema_section(first['content'], schema)
        # The original content is preserved before the appended section
        assert first['content'].startswith('You are helpful.')
        # The appended section is separated from the original content by a
        # blank line
        assert '\n\n# Response Formats' in first['content']

    def test_schema_inserted_as_new_system_message_when_none_exists(self):
        """When no system message exists a new one is inserted at position
        0."""
        schema = {'type': 'object', 'properties': {'name': {'type': 'string'}}}
        result = self._parsed_request(
            [{'role': 'user', 'content': 'hi'}],
            self._schema_format(schema),
        )

        first, second = result.messages[0], result.messages[1]
        assert first['role'] == 'system'
        assert result.response_format is None
        # New system message content must NOT start with blank lines
        assert not first['content'].startswith('\n')
        assert first['content'].startswith('# Response Formats')
        import json as _json

        assert _json.dumps(schema) in first['content']
        # The user message is still present after the inserted system message
        assert second['role'] == 'user'

    def test_text_response_format_is_cleared_by_normalize(self):
        """A ``type='text'`` response_format ends up as None."""
        from lmdeploy.serve.openai.protocol import ResponseFormat

        result = self._parsed_request(
            [{'role': 'user', 'content': 'hi'}],
            ResponseFormat(type='text'),
        )
        assert result.response_format is None

    def test_no_response_format_leaves_request_unchanged(self):
        """When response_format is None the request is not modified."""
        result = self._parsed_request([{'role': 'user', 'content': 'hi'}])
        assert result.response_format is None
        assert len(result.messages) == 1

    def test_str_messages_gets_schema_appended(self):
        """When messages is a string, the schema section is appended to it."""
        schema = {'type': 'object', 'properties': {'x': {'type': 'integer'}}}
        result = self._parsed_request('Tell me a joke', self._schema_format(schema))

        assert result.response_format is None
        assert isinstance(result.messages, str)
        assert result.messages.startswith('Tell me a joke')
        self._assert_schema_section(result.messages, schema)

    def test_non_pydantic_request_messages_updated(self):
        """Non-Pydantic sentinel requests also get messages updated."""
        schema = {'type': 'object', 'properties': {'y': {'type': 'number'}}}
        fmt = self._schema_format(schema)

        # Sentinel must NOT have tools/tool_choice attrs so that __init__
        # skips the Pydantic-dependent tool-rendering branch.
        class _Sentinel:
            messages = [{'role': 'user', 'content': 'hi'}]
            response_format = fmt

        parser = gpt_oss_mod.GptOssResponseParser(request=_Sentinel(), tokenizer=object())

        assert parser.request.response_format is None
        msgs = parser.request.messages
        assert isinstance(msgs, list)
        assert msgs[0]['role'] == 'system'
        self._assert_schema_section(msgs[0]['content'], schema)

    def test_list_content_system_message_gets_text_block_appended(self):
        """When system message content is a list (multimodal), append a text
        block."""
        schema = {'type': 'object', 'properties': {'z': {'type': 'boolean'}}}
        system_blocks = [
            {'type': 'text', 'text': 'You are helpful.'},
            {'type': 'image_url', 'image_url': {'url': 'http://example.com/img.png'}},
        ]
        result = self._parsed_request(
            [
                {'role': 'system', 'content': system_blocks},
                {'role': 'user', 'content': 'hi'},
            ],
            self._schema_format(schema),
        )

        assert result.response_format is None
        sys_msg = result.messages[0]
        assert sys_msg['role'] == 'system'
        blocks = sys_msg['content']
        assert isinstance(blocks, list)
        assert len(blocks) == 3
        # Original two blocks preserved
        assert blocks[0]['type'] == 'text'
        assert blocks[0]['text'] == 'You are helpful.'
        assert blocks[1]['type'] == 'image_url'
        # Schema appended as a text block
        assert blocks[2]['type'] == 'text'
        self._assert_schema_section(blocks[2]['text'], schema)

    def test_none_content_system_message_inserts_separate_system(self):
        """When system message content is None, insert a new system message."""
        schema = {'type': 'object', 'properties': {'w': {'type': 'string'}}}
        result = self._parsed_request(
            [
                {'role': 'system', 'content': None},
                {'role': 'user', 'content': 'hi'},
            ],
            self._schema_format(schema),
        )

        assert result.response_format is None
        # A new system message with the schema is inserted at position 0
        first = result.messages[0]
        assert first['role'] == 'system'
        self._assert_schema_section(first['content'], schema)