Skip to content

Commit e991917

Browse files
windreamer and Copilot committed
fix: convert guided decoding schema into Harmony-native mode to avoid Harmony/JSON mode conflict for GPT-OSS
Move the GPT-OSS guided decoding logic from api_server.py inline code into GptOssResponseParser._convert_response_format_to_harmony(), following the established ResponseParser pattern for model-specific request handling. When the model architecture is GptOssForCausalLM and a structured response_format is requested, the schema is now injected into the system prompt as a '# Response Formats' section and response_format is cleared on the request to avoid the conflict between Harmony-native mode and the engine's built-in JSON/response-format mode. In api_server.py, response_format extraction is moved after parser instantiation so that the parser can modify the request first. Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 81be529 commit e991917

2 files changed

Lines changed: 56 additions & 4 deletions

File tree

lmdeploy/serve/openai/api_server.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -417,15 +417,17 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
417417

418418
random_seed = request.seed if request.seed is not None else None
419419
max_new_tokens = (request.max_completion_tokens if request.max_completion_tokens else request.max_tokens)
420-
response_format = None
421-
if request.response_format and request.response_format.type != 'text':
422-
response_format = request.response_format.model_dump()
423420

424421
parser_cls = VariableInterface.response_parser_cls
425422
response_parser = parser_cls(request=request, tokenizer=tokenizer)
426-
# request might be adjusted by tool parser
423+
# request might be adjusted by the response parser (e.g. GPT-OSS clears
424+
# response_format and injects the schema into messages instead)
427425
request = response_parser.request
428426

427+
response_format = None
428+
if request.response_format and request.response_format.type != 'text':
429+
response_format = request.response_format.model_dump()
430+
429431
gen_config = GenerationConfig(
430432
max_new_tokens=max_new_tokens,
431433
do_sample=True,

lmdeploy/serve/parsers/_openai_harmony.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
available."""
44
from __future__ import annotations
55

6+
import json
67
import re
78
from typing import TYPE_CHECKING
89

@@ -16,6 +17,7 @@
1617
FunctionCall,
1718
ToolCall,
1819
)
20+
from lmdeploy.utils import get_logger
1921

2022
from .response_parser import ResponseParser, ResponseParserManager
2123

@@ -24,6 +26,8 @@
2426

2527
from lmdeploy.serve.openai.protocol import ChatCompletionRequest
2628

29+
logger = get_logger('lmdeploy')
30+
2731
_harmony_encoding = None
2832

2933

@@ -55,6 +59,7 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
5559
else:
5660
# Unit tests may inject a lightweight sentinel request object.
5761
self.request = request
62+
self._convert_response_format_to_harmony()
5863
self.model_tokenizer = tokenizer
5964
self.parser = StreamableParser(get_encoding(), role=Role.ASSISTANT)
6065
self._seen_any = False
@@ -64,6 +69,51 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
6469
self._active_tool_name: str | None = None
6570
self.tool_parser = object() # API server checks `is not None` for tool support.
6671

72+
def _convert_response_format_to_harmony(self):
    """Convert ``response_format`` to Harmony-native mode for GPT-OSS.

    GPT-OSS uses Harmony mode for structured output, which conflicts with
    the engine's built-in JSON/response-format mode. This method injects
    the ``response_format`` schema into the system prompt as a
    ``# Response Formats`` section and clears ``response_format`` on the
    request so that only the Harmony-native instructions are used.

    Behavior:
        * No-op when ``self.request`` has no ``response_format`` or its
          ``type`` is ``'text'``.
        * If a dict message with ``role == 'system'`` exists, the section
          is appended to the first such message: concatenated when its
          content is a string (or empty), or added as an extra
          ``{'type': 'text', ...}`` part when its content is a list of
          content parts.
        * Otherwise a new system message is inserted at index 0.
        * If ``messages`` is not a list, only ``response_format`` is
          cleared and a warning is logged.

    The caller's message list and message dicts are never mutated;
    ``self.request`` is rebound to a copy produced by ``model_copy``.
    Any unexpected failure is logged and ``response_format`` is still
    cleared so the Harmony/JSON mode conflict cannot occur.
    """
    fmt = getattr(self.request, 'response_format', None)
    if fmt is None or getattr(fmt, 'type', 'text') == 'text':
        return

    try:
        format_json = json.dumps(fmt.model_dump())
        format_section = f'\n\n# Response Formats\n{format_json}'
        messages = self.request.messages

        if not isinstance(messages, list):
            logger.warning('Cannot inject response_format schema into '
                           'non-list messages for GPT-OSS; clearing response_format only.')
            self.request = self.request.model_copy(update={'response_format': None})
            return

        # Work on a shallow copy so the caller's list is left untouched.
        new_messages = list(messages)
        system_idx = next(
            (i for i, msg in enumerate(new_messages) if isinstance(msg, dict) and msg.get('role') == 'system'),
            None,
        )

        if system_idx is None:
            new_messages.insert(0, {'role': 'system', 'content': format_section})
        else:
            system_msg = new_messages[system_idx]
            content = system_msg.get('content') or ''
            if isinstance(content, list):
                # Multi-part content: string concatenation would raise
                # TypeError and silently drop the schema, so append the
                # section as an additional text part instead.
                new_content = [*content, {'type': 'text', 'text': format_section}]
            else:
                new_content = content + format_section
            # Rebuild the dict rather than mutating the caller's message.
            new_messages[system_idx] = {**system_msg, 'content': new_content}

        self.request = self.request.model_copy(update={
            'response_format': None,
            'messages': new_messages,
        })
    except Exception as e:
        # Best-effort boundary: log with traceback, then fall through to
        # clearing response_format so generation can still proceed.
        logger.exception('Failed to convert response_format to Harmony-native mode for GPT-OSS: %s', e)
        # Still clear response_format to avoid the Harmony/JSON mode conflict
        self.request = self.request.model_copy(update={'response_format': None})
116+
67117
def stream_chunk(self, delta_text: str, delta_token_ids: list[int], **kwargs) -> tuple[DeltaMessage | None, bool]:
68118
if (
69119
not delta_text

0 commit comments

Comments
 (0)