From e99191770c43e71f5ab2c1715b16f2f618eb56cd Mon Sep 17 00:00:00 2001 From: windreamer Date: Thu, 7 May 2026 14:31:57 +0800 Subject: [PATCH 1/5] fix: convert guided decoding schema into Harmony-native mode to avoid Harmony/JSON mode conflict for GPT-OSS Move the GPT-OSS guided decoding logic from api_server.py inline code into GptOssResponseParser._convert_response_format_to_harmony(), following the established ResponseParser pattern for model-specific request handling. When the model architecture is GptOssForCausalLM and a structured response_format is requested, the schema is now injected into the system prompt as a '# Response Formats' section and response_format is cleared on the request to avoid the conflict between Harmony-native mode and the engine's built-in JSON/response-format mode. In api_server.py, response_format extraction is moved after parser instantiation so that the parser can modify the request first. Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- lmdeploy/serve/openai/api_server.py | 10 +++-- lmdeploy/serve/parsers/_openai_harmony.py | 50 +++++++++++++++++++++++ 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index e10f20f44e..45229a272d 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -417,15 +417,17 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque random_seed = request.seed if request.seed is not None else None max_new_tokens = (request.max_completion_tokens if request.max_completion_tokens else request.max_tokens) - response_format = None - if request.response_format and request.response_format.type != 'text': - response_format = request.response_format.model_dump() parser_cls = VariableInterface.response_parser_cls response_parser = parser_cls(request=request, tokenizer=tokenizer) - # request might be adjusted by tool parser + # request might be adjusted by the response parser (e.g. GPT-OSS clears + # response_format and injects the schema into messages instead) request = response_parser.request + response_format = None + if request.response_format and request.response_format.type != 'text': + response_format = request.response_format.model_dump() + gen_config = GenerationConfig( max_new_tokens=max_new_tokens, do_sample=True, diff --git a/lmdeploy/serve/parsers/_openai_harmony.py b/lmdeploy/serve/parsers/_openai_harmony.py index b1bb492709..045988763b 100644 --- a/lmdeploy/serve/parsers/_openai_harmony.py +++ b/lmdeploy/serve/parsers/_openai_harmony.py @@ -3,6 +3,7 @@ available.""" from __future__ import annotations +import json import re from typing import TYPE_CHECKING @@ -16,6 +17,7 @@ FunctionCall, ToolCall, ) +from lmdeploy.utils import get_logger from .response_parser import ResponseParser, ResponseParserManager @@ -24,6 +26,8 @@ from lmdeploy.serve.openai.protocol import ChatCompletionRequest +logger = get_logger('lmdeploy') + _harmony_encoding = None @@ -55,6 +59,7 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize else: # Unit tests may inject a lightweight sentinel request object. self.request = request + self._convert_response_format_to_harmony() self.model_tokenizer = tokenizer self.parser = StreamableParser(get_encoding(), role=Role.ASSISTANT) self._seen_any = False @@ -64,6 +69,51 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize self._active_tool_name: str | None = None self.tool_parser = object() # API server checks `is not None` for tool support. + def _convert_response_format_to_harmony(self): + """Convert response_format to Harmony-native mode for GPT-OSS. + + GPT-OSS uses Harmony mode for structured output, which conflicts with + the engine's built-in JSON/response-format mode. This method injects + the response_format schema into the system prompt as a + ``# Response Formats`` section and clears ``response_format`` on the + request so that only the Harmony-native instructions are used. + """ + fmt = getattr(self.request, 'response_format', None) + if fmt is None or getattr(fmt, 'type', 'text') == 'text': + return + + try: + format_json = json.dumps(fmt.model_dump()) + format_section = f'\n\n# Response Formats\n{format_json}' + messages = self.request.messages + + if not isinstance(messages, list): + logger.warning('Cannot inject response_format schema into ' + 'non-list messages for GPT-OSS; clearing response_format only.') + self.request = self.request.model_copy(update={'response_format': None}) + return + + new_messages = list(messages) + system_idx = next( + (i for i, msg in enumerate(new_messages) if isinstance(msg, dict) and msg.get('role') == 'system'), + None, + ) + + if system_idx is not None: + content = new_messages[system_idx].get('content') or '' + new_messages[system_idx] = {**new_messages[system_idx], 'content': content + format_section} + else: + new_messages.insert(0, {'role': 'system', 'content': format_section}) + + self.request = self.request.model_copy(update={ + 'response_format': None, + 'messages': new_messages, + }) + except Exception as e: + logger.error(f'Failed to convert response_format to Harmony-native mode for GPT-OSS: {e}') + # Still clear response_format to avoid the Harmony/JSON mode conflict + self.request = self.request.model_copy(update={'response_format': None}) + def stream_chunk(self, delta_text: str, delta_token_ids: list[int], **kwargs) -> tuple[DeltaMessage | None, bool]: if ( not delta_text From 0ffa417043e08310b17c7799a193086eb24b53c2 Mon Sep 17 00:00:00 2001 From: windreamer Date: Thu, 7 May 2026 14:59:18 +0800 Subject: [PATCH 2/5] fix: avoid leading blank lines in new system message & add response_format conversion tests - Build format_body without leading newlines; only prefix with \n\n when appending to an existing system message. This prevents a newly inserted system message from starting with blank lines that could interact poorly with downstream chat-template rendering. - Add TestGptOssResponseFormatHarmonyConversion test class with 5 tests: 1. response_format is cleared after conversion 2. schema appended to existing system message 3. schema inserted as new system message (no leading blank lines) 4. text-type response_format is not converted 5. no response_format leaves request unchanged --- lmdeploy/serve/parsers/_openai_harmony.py | 9 +- .../serve/parsers/test_gpt_oss_parser.py | 139 ++++++++++++++++++ 2 files changed, 145 insertions(+), 3 deletions(-) diff --git a/lmdeploy/serve/parsers/_openai_harmony.py b/lmdeploy/serve/parsers/_openai_harmony.py index 045988763b..ada0d7d4e5 100644 --- a/lmdeploy/serve/parsers/_openai_harmony.py +++ b/lmdeploy/serve/parsers/_openai_harmony.py @@ -84,7 +84,7 @@ def _convert_response_format_to_harmony(self): try: format_json = json.dumps(fmt.model_dump()) - format_section = f'\n\n# Response Formats\n{format_json}' + format_body = f'# Response Formats\n{format_json}' messages = self.request.messages if not isinstance(messages, list): @@ -101,9 +101,12 @@ def _convert_response_format_to_harmony(self): if system_idx is not None: content = new_messages[system_idx].get('content') or '' - new_messages[system_idx] = {**new_messages[system_idx], 'content': content + format_section} + new_messages[system_idx] = { + **new_messages[system_idx], + 'content': content + '\n\n' + format_body, + } else: - new_messages.insert(0, {'role': 'system', 'content': format_section}) + new_messages.insert(0, {'role': 'system', 'content': format_body}) self.request = self.request.model_copy(update={ 'response_format': None, diff --git a/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py b/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py index a47b632b28..fd0c71ab83 100644 --- a/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py +++ b/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py @@ -329,3 +329,142 @@ def test_parse_complete_appends_tool_call_still_open_at_eof(self, monkeypatch): ) def test_extract_tool_name(self, recipient, expected): assert gpt_oss_mod.GptOssResponseParser._extract_tool_name(recipient) == expected + + +class TestGptOssResponseFormatHarmonyConversion: + """Tests for + :meth:`GptOssResponseParser._convert_response_format_to_harmony`.""" + + @pytest.fixture() + def _patch_streamable_parser(self, monkeypatch): + monkeypatch.setattr( + openai_harmony_mod, + 'StreamableParser', + lambda *args, **kwargs: _FakeStreamableParser({}), + ) + + def test_response_format_cleared_after_conversion(self, monkeypatch): + """response_format must be None after the parser processes it.""" + monkeypatch.setattr( + openai_harmony_mod, + 'StreamableParser', + lambda *args, **kwargs: _FakeStreamableParser({}), + ) + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[{'role': 'user', 'content': 'hi'}], + response_format=ResponseFormat( + type='json_schema', + json_schema=JsonSchema( + name='test', + schema={'type': 'object', 'properties': {'x': {'type': 'integer'}}}, + ), + ), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + assert parser.request.response_format is None + + def test_schema_appended_to_existing_system_message(self, monkeypatch): + """When a system message already exists the schema is appended to + it.""" + monkeypatch.setattr( + openai_harmony_mod, + 'StreamableParser', + lambda *args, **kwargs: _FakeStreamableParser({}), + ) + import json as _json + + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + schema_dict = {'type': 'object', 'properties': {'x': {'type': 'integer'}}} + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[ + {'role': 'system', 'content': 'You are helpful.'}, + {'role': 'user', 'content': 'hi'}, + ], + response_format=ResponseFormat( + type='json_schema', + json_schema=JsonSchema(name='test', schema=schema_dict), + ), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + + msgs = parser.request.messages + assert msgs[0]['role'] == 'system' + assert parser.request.response_format is None + # The schema body must appear in the system message + assert '# Response Formats' in msgs[0]['content'] + assert _json.dumps(schema_dict) in msgs[0]['content'] + # The original content is preserved before the appended section + assert msgs[0]['content'].startswith('You are helpful.') + # No leading blank lines in the appended section + assert '\n\n# Response Formats' in msgs[0]['content'] + + def test_schema_inserted_as_new_system_message_when_none_exists(self, monkeypatch): + """When no system message exists a new one is inserted at position + 0.""" + monkeypatch.setattr( + openai_harmony_mod, + 'StreamableParser', + lambda *args, **kwargs: _FakeStreamableParser({}), + ) + import json as _json + + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + schema_dict = {'type': 'object', 'properties': {'name': {'type': 'string'}}} + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[{'role': 'user', 'content': 'hi'}], + response_format=ResponseFormat( + type='json_schema', + json_schema=JsonSchema(name='test', schema=schema_dict), + ), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + + msgs = parser.request.messages + assert msgs[0]['role'] == 'system' + assert parser.request.response_format is None + # New system message content must NOT start with blank lines + assert not msgs[0]['content'].startswith('\n') + assert msgs[0]['content'].startswith('# Response Formats') + assert _json.dumps(schema_dict) in msgs[0]['content'] + # The user message is still present after the inserted system message + assert msgs[1]['role'] == 'user' + + def test_text_response_format_is_not_converted(self, monkeypatch): + """A text-type response_format should be left untouched.""" + monkeypatch.setattr( + openai_harmony_mod, + 'StreamableParser', + lambda *args, **kwargs: _FakeStreamableParser({}), + ) + from lmdeploy.serve.openai.protocol import ResponseFormat + + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[{'role': 'user', 'content': 'hi'}], + response_format=ResponseFormat(type='text'), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + assert parser.request.response_format is not None + assert parser.request.response_format.type == 'text' + + def test_no_response_format_leaves_request_unchanged(self, monkeypatch): + """When response_format is None the request is not modified.""" + monkeypatch.setattr( + openai_harmony_mod, + 'StreamableParser', + lambda *args, **kwargs: _FakeStreamableParser({}), + ) + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[{'role': 'user', 'content': 'hi'}], + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + assert parser.request.response_format is None + assert len(parser.request.messages) == 1 From cb76d4352de6c6e1af04a83dc8d84416dd19cf30 Mon Sep 17 00:00:00 2001 From: windreamer Date: Thu, 7 May 2026 16:20:45 +0800 Subject: [PATCH 3/5] fix: address unresolved PR review comments 1. Guard model_copy() with hasattr check: extract _clear_response_format() helper that falls back to in-place mutation for non-Pydantic request objects (e.g. test sentinels). Prevents double-raise in the except path. 2. Use logger.exception() instead of logger.error(f'...{e}') so that stack traces are preserved in the log output. 3. Mark _patch_streamable_parser fixture as autouse=True and remove redundant monkeypatch.setattr calls from individual test methods. --- lmdeploy/serve/parsers/_openai_harmony.py | 52 ++++-- .../serve/parsers/test_gpt_oss_parser.py | 154 ++++++++++++++---- 2 files changed, 162 insertions(+), 44 deletions(-) diff --git a/lmdeploy/serve/parsers/_openai_harmony.py b/lmdeploy/serve/parsers/_openai_harmony.py index ada0d7d4e5..6957d9f3f7 100644 --- a/lmdeploy/serve/parsers/_openai_harmony.py +++ b/lmdeploy/serve/parsers/_openai_harmony.py @@ -87,10 +87,15 @@ def _convert_response_format_to_harmony(self): format_body = f'# Response Formats\n{format_json}' messages = self.request.messages + if isinstance(messages, str): + messages = messages + '\n\n' + format_body + self._clear_response_format(messages=messages) + return + if not isinstance(messages, list): logger.warning('Cannot inject response_format schema into ' 'non-list messages for GPT-OSS; clearing response_format only.') - self.request = self.request.model_copy(update={'response_format': None}) + self._clear_response_format() return new_messages = list(messages) @@ -100,22 +105,43 @@ def _convert_response_format_to_harmony(self): ) if system_idx is not None: - content = new_messages[system_idx].get('content') or '' - new_messages[system_idx] = { - **new_messages[system_idx], - 'content': content + '\n\n' + format_body, - } + content = new_messages[system_idx].get('content') + if isinstance(content, list): + # Multimodal content blocks — append a text block. + new_messages[system_idx] = { + **new_messages[system_idx], + 'content': content + [{'type': 'text', 'text': format_body}], + } + elif isinstance(content, str): + new_messages[system_idx] = { + **new_messages[system_idx], + 'content': (content + '\n\n' + format_body) if content else format_body, + } + else: + # content is None or unexpected type — insert a separate + # system message so the schema is still available. + new_messages.insert(0, {'role': 'system', 'content': format_body}) else: new_messages.insert(0, {'role': 'system', 'content': format_body}) - self.request = self.request.model_copy(update={ - 'response_format': None, - 'messages': new_messages, - }) - except Exception as e: - logger.error(f'Failed to convert response_format to Harmony-native mode for GPT-OSS: {e}') + self._clear_response_format(messages=new_messages) + except Exception: + logger.exception('Failed to convert response_format to Harmony-native mode for GPT-OSS') # Still clear response_format to avoid the Harmony/JSON mode conflict - self.request = self.request.model_copy(update={'response_format': None}) + self._clear_response_format() + + def _clear_response_format(self, messages=None): + """Clear response_format on the request, handling both Pydantic and + plain objects.""" + if hasattr(self.request, 'model_copy'): + update = {'response_format': None} + if messages is not None: + update['messages'] = messages + self.request = self.request.model_copy(update=update) + else: + self.request.response_format = None + if messages is not None: + self.request.messages = messages def stream_chunk(self, delta_text: str, delta_token_ids: list[int], **kwargs) -> tuple[DeltaMessage | None, bool]: if ( diff --git a/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py b/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py index fd0c71ab83..05bea87b7b 100644 --- a/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py +++ b/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py @@ -335,7 +335,7 @@ class TestGptOssResponseFormatHarmonyConversion: """Tests for :meth:`GptOssResponseParser._convert_response_format_to_harmony`.""" - @pytest.fixture() + @pytest.fixture(autouse=True) def _patch_streamable_parser(self, monkeypatch): monkeypatch.setattr( openai_harmony_mod, @@ -343,13 +343,8 @@ def _patch_streamable_parser(self, monkeypatch): lambda *args, **kwargs: _FakeStreamableParser({}), ) - def test_response_format_cleared_after_conversion(self, monkeypatch): + def test_response_format_cleared_after_conversion(self): """response_format must be None after the parser processes it.""" - monkeypatch.setattr( - openai_harmony_mod, - 'StreamableParser', - lambda *args, **kwargs: _FakeStreamableParser({}), - ) from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat request = ChatCompletionRequest( @@ -366,14 +361,9 @@ def test_response_format_cleared_after_conversion(self, monkeypatch): parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) assert parser.request.response_format is None - def test_schema_appended_to_existing_system_message(self, monkeypatch): + def test_schema_appended_to_existing_system_message(self): """When a system message already exists the schema is appended to it.""" - monkeypatch.setattr( - openai_harmony_mod, - 'StreamableParser', - lambda *args, **kwargs: _FakeStreamableParser({}), - ) import json as _json from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat @@ -403,14 +393,9 @@ def test_schema_appended_to_existing_system_message(self, monkeypatch): # No leading blank lines in the appended section assert '\n\n# Response Formats' in msgs[0]['content'] - def test_schema_inserted_as_new_system_message_when_none_exists(self, monkeypatch): + def test_schema_inserted_as_new_system_message_when_none_exists(self): """When no system message exists a new one is inserted at position 0.""" - monkeypatch.setattr( - openai_harmony_mod, - 'StreamableParser', - lambda *args, **kwargs: _FakeStreamableParser({}), - ) import json as _json from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat @@ -436,13 +421,8 @@ def test_schema_inserted_as_new_system_message_when_none_exists(self, monkeypatc # The user message is still present after the inserted system message assert msgs[1]['role'] == 'user' - def test_text_response_format_is_not_converted(self, monkeypatch): + def test_text_response_format_is_not_converted(self): """A text-type response_format should be left untouched.""" - monkeypatch.setattr( - openai_harmony_mod, - 'StreamableParser', - lambda *args, **kwargs: _FakeStreamableParser({}), - ) from lmdeploy.serve.openai.protocol import ResponseFormat request = ChatCompletionRequest( @@ -454,13 +434,8 @@ def test_text_response_format_is_not_converted(self, monkeypatch): assert parser.request.response_format is not None assert parser.request.response_format.type == 'text' - def test_no_response_format_leaves_request_unchanged(self, monkeypatch): + def test_no_response_format_leaves_request_unchanged(self): """When response_format is None the request is not modified.""" - monkeypatch.setattr( - openai_harmony_mod, - 'StreamableParser', - lambda *args, **kwargs: _FakeStreamableParser({}), - ) request = ChatCompletionRequest( model='openai/gpt-oss-20b', messages=[{'role': 'user', 'content': 'hi'}], @@ -468,3 +443,120 @@ def test_no_response_format_leaves_request_unchanged(self, monkeypatch): parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) assert parser.request.response_format is None assert len(parser.request.messages) == 1 + + def test_str_messages_gets_schema_appended(self): + """When messages is a string, the schema section is appended to it.""" + import json as _json + + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + schema_dict = {'type': 'object', 'properties': {'x': {'type': 'integer'}}} + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages='Tell me a joke', + response_format=ResponseFormat( + type='json_schema', + json_schema=JsonSchema(name='test', schema=schema_dict), + ), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + + assert parser.request.response_format is None + assert isinstance(parser.request.messages, str) + assert parser.request.messages.startswith('Tell me a joke') + assert '# Response Formats' in parser.request.messages + assert _json.dumps(schema_dict) in parser.request.messages + + def test_non_pydantic_request_messages_updated(self): + """Non-Pydantic sentinel requests also get messages updated.""" + import json as _json + + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + schema_dict = {'type': 'object', 'properties': {'y': {'type': 'number'}}} + fmt = ResponseFormat( + type='json_schema', + json_schema=JsonSchema(name='test', schema=schema_dict), + ) + + # Sentinel must NOT have tools/tool_choice attrs so that __init__ + # skips the Pydantic-dependent tool-rendering branch. + class _Sentinel: + messages = [{'role': 'user', 'content': 'hi'}] + response_format = fmt + + sentinel = _Sentinel() + parser = gpt_oss_mod.GptOssResponseParser(request=sentinel, tokenizer=object()) + + assert parser.request.response_format is None + msgs = parser.request.messages + assert isinstance(msgs, list) + assert msgs[0]['role'] == 'system' + assert '# Response Formats' in msgs[0]['content'] + assert _json.dumps(schema_dict) in msgs[0]['content'] + + def test_list_content_system_message_gets_text_block_appended(self): + """When system message content is a list (multimodal), append a text + block.""" + import json as _json + + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + schema_dict = {'type': 'object', 'properties': {'z': {'type': 'boolean'}}} + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[ + {'role': 'system', 'content': [ + {'type': 'text', 'text': 'You are helpful.'}, + {'type': 'image_url', 'image_url': {'url': 'http://example.com/img.png'}}, + ]}, + {'role': 'user', 'content': 'hi'}, + ], + response_format=ResponseFormat( + type='json_schema', + json_schema=JsonSchema(name='test', schema=schema_dict), + ), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + + assert parser.request.response_format is None + sys_msg = parser.request.messages[0] + assert sys_msg['role'] == 'system' + content = sys_msg['content'] + assert isinstance(content, list) + assert len(content) == 3 + # Original two blocks preserved + assert content[0]['type'] == 'text' + assert content[0]['text'] == 'You are helpful.' + assert content[1]['type'] == 'image_url' + # Schema appended as a text block + assert content[2]['type'] == 'text' + assert '# Response Formats' in content[2]['text'] + assert _json.dumps(schema_dict) in content[2]['text'] + + def test_none_content_system_message_inserts_separate_system(self): + """When system message content is None, insert a new system message.""" + import json as _json + + from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat + + schema_dict = {'type': 'object', 'properties': {'w': {'type': 'string'}}} + request = ChatCompletionRequest( + model='openai/gpt-oss-20b', + messages=[ + {'role': 'system', 'content': None}, + {'role': 'user', 'content': 'hi'}, + ], + response_format=ResponseFormat( + type='json_schema', + json_schema=JsonSchema(name='test', schema=schema_dict), + ), + ) + parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) + + assert parser.request.response_format is None + msgs = parser.request.messages + # A new system message with the schema is inserted at position 0 + assert msgs[0]['role'] == 'system' + assert '# Response Formats' in msgs[0]['content'] + assert _json.dumps(schema_dict) in msgs[0]['content'] From 61892e54ea1e9af53485482a19dc4de495e85fe0 Mon Sep 17 00:00:00 2001 From: windreamer Date: Tue, 12 May 2026 20:24:08 +0800 Subject: [PATCH 4/5] refactor: move request normalization into parser system Move stop, response_format, and max_completion_tokens normalization out of api_server.py into normalize_chat_request() in the parser module. Both BaseResponseParser and GptOssResponseParser call this after their own request adjustments, ensuring parser.request is always normalized. --- lmdeploy/serve/openai/api_server.py | 16 +++--------- lmdeploy/serve/parsers/_openai_harmony.py | 3 ++- lmdeploy/serve/parsers/response_parser.py | 31 +++++++++++++++++++++++ 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 45229a272d..804bbf87d4 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -393,9 +393,6 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque request_id = str(session.session_id) created_time = int(time.time()) - if isinstance(request.stop, str): - request.stop = [request.stop] - tokenizer = VariableInterface.async_engine.tokenizer.model.model gen_logprobs, logits_processors = None, None if request.logprobs and request.top_logprobs: @@ -416,20 +413,15 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque 'Please launch the api_server with --tool-call-parser if you want to use tool.') random_seed = request.seed if request.seed is not None else None - max_new_tokens = (request.max_completion_tokens if request.max_completion_tokens else request.max_tokens) parser_cls = VariableInterface.response_parser_cls response_parser = parser_cls(request=request, tokenizer=tokenizer) - # request might be adjusted by the response parser (e.g. GPT-OSS clears - # response_format and injects the schema into messages instead) + # request is normalized and may be adjusted by the parser + # (e.g. GPT-OSS clears response_format and injects the schema into messages) request = response_parser.request - response_format = None - if request.response_format and request.response_format.type != 'text': - response_format = request.response_format.model_dump() - gen_config = GenerationConfig( - max_new_tokens=max_new_tokens, + max_new_tokens=request.max_completion_tokens, do_sample=True, logprobs=gen_logprobs, top_k=request.top_k, @@ -440,7 +432,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque stop_words=request.stop, include_stop_str_in_output=request.include_stop_str_in_output, skip_special_tokens=request.skip_special_tokens, - response_format=response_format, + response_format=request.response_format, logits_processors=logits_processors, min_new_tokens=request.min_new_tokens, min_p=request.min_p, diff --git a/lmdeploy/serve/parsers/_openai_harmony.py b/lmdeploy/serve/parsers/_openai_harmony.py index 6957d9f3f7..9de071dcdd 100644 --- a/lmdeploy/serve/parsers/_openai_harmony.py +++ b/lmdeploy/serve/parsers/_openai_harmony.py @@ -19,7 +19,7 @@ ) from lmdeploy.utils import get_logger -from .response_parser import ResponseParser, ResponseParserManager +from .response_parser import ResponseParser, ResponseParserManager, normalize_chat_request if TYPE_CHECKING: from transformers import PreTrainedTokenizerBase @@ -60,6 +60,7 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize # Unit tests may inject a lightweight sentinel request object. self.request = request self._convert_response_format_to_harmony() + self.request = normalize_chat_request(self.request) self.model_tokenizer = tokenizer self.parser = StreamableParser(get_encoding(), role=Role.ASSISTANT) self._seen_any = False diff --git a/lmdeploy/serve/parsers/response_parser.py b/lmdeploy/serve/parsers/response_parser.py index ba75e2db2b..8b0cc8dc54 100644 --- a/lmdeploy/serve/parsers/response_parser.py +++ b/lmdeploy/serve/parsers/response_parser.py @@ -25,6 +25,34 @@ ResponseParserManager = Registry('response_parser', locations=['lmdeploy.serve.parsers.response_parser']) +def normalize_chat_request(request: ChatCompletionRequest) -> ChatCompletionRequest: + """Normalize a ChatCompletionRequest for downstream consumption. + + Normalizes (via ``model_copy``): + - ``response_format``: ``ResponseFormat → dict``, ``type='text' → None`` + - ``stop``: ``str → list[str]`` + - ``max_completion_tokens``: resolves from deprecated ``max_tokens`` + """ + updates: dict = {} + + fmt = request.response_format + if fmt is not None and fmt.type != 'text': + updates['response_format'] = fmt.model_dump() + elif fmt is not None and fmt.type == 'text': + updates['response_format'] = None + + if isinstance(request.stop, str): + updates['stop'] = [request.stop] + + if request.max_completion_tokens is None and request.max_tokens is not None: + updates['max_completion_tokens'] = request.max_tokens + + if updates: + request = request.model_copy(update=updates) + + return request + + class ResponseParser: @classmethod def set_parsers(cls, reasoning_parser_name: str | None = None, tool_parser_name: str | None = None) -> None: @@ -162,6 +190,9 @@ def __init__( self.request = self.tool_parser.adjust_request(request) else: self.request = self.dump_tools(request) + + self.request = normalize_chat_request(self.request) + self._accumulated_text = '' self.profile = self._build_profile() From 1fb944a1ddeee21ab5e6ef4e7b4e606910de211b Mon Sep 17 00:00:00 2001 From: windreamer Date: Wed, 13 May 2026 11:05:14 +0800 Subject: [PATCH 5/5] refactor: simplify normalize_chat_request getattr and add model_copy guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add model_copy guard for non-Pydantic objects - Use direct attribute access for stable fields, keep getattr only for deprecated max_tokens - Update gpt_oss test to match text response_format→None normalization --- lmdeploy/serve/parsers/response_parser.py | 10 +++++++--- .../test_lmdeploy/serve/parsers/test_gpt_oss_parser.py | 6 ++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/lmdeploy/serve/parsers/response_parser.py b/lmdeploy/serve/parsers/response_parser.py index 8b0cc8dc54..293c107427 100644 --- a/lmdeploy/serve/parsers/response_parser.py +++ b/lmdeploy/serve/parsers/response_parser.py @@ -28,11 +28,13 @@ def normalize_chat_request(request: ChatCompletionRequest) -> ChatCompletionRequest: """Normalize a ChatCompletionRequest for downstream consumption. - Normalizes (via ``model_copy``): - ``response_format``: ``ResponseFormat → dict``, ``type='text' → None`` - ``stop``: ``str → list[str]`` - ``max_completion_tokens``: resolves from deprecated ``max_tokens`` """ + if not hasattr(request, 'model_copy'): + return request + updates: dict = {} fmt = request.response_format @@ -44,8 +46,10 @@ def normalize_chat_request(request: ChatCompletionRequest) -> ChatCompletionRequ if isinstance(request.stop, str): updates['stop'] = [request.stop] - if request.max_completion_tokens is None and request.max_tokens is not None: - updates['max_completion_tokens'] = request.max_tokens + if request.max_completion_tokens is None: + max_tokens = getattr(request, 'max_tokens', None) + if max_tokens is not None: + updates['max_completion_tokens'] = max_tokens if updates: request = request.model_copy(update=updates) diff --git a/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py b/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py index 05bea87b7b..16001ef722 100644 --- a/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py +++ b/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py @@ -421,8 +421,7 @@ def test_schema_inserted_as_new_system_message_when_none_exists(self): # The user message is still present after the inserted system message assert msgs[1]['role'] == 'user' - def test_text_response_format_is_not_converted(self): - """A text-type response_format should be left untouched.""" + def test_text_response_format_is_cleared_by_normalize(self): from lmdeploy.serve.openai.protocol import ResponseFormat request = ChatCompletionRequest( @@ -431,8 +430,7 @@ def test_text_response_format_is_not_converted(self): response_format=ResponseFormat(type='text'), ) parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object()) - assert parser.request.response_format is not None - assert parser.request.response_format.type == 'text' + assert parser.request.response_format is None def test_no_response_format_leaves_request_unchanged(self): """When response_format is None the request is not modified."""