fix: convert guided decoding schema into Harmony-native mode to avoid Harmony/JSON mode conflict for GPT-OSS

windreamer · Copilot · windreamer · commit e99191770c43 · 2026-05-07T14:31:57.000+08:00
Move the GPT-OSS guided decoding logic from api_server.py inline code into
GptOssResponseParser._convert_response_format_to_harmony(), following the
established ResponseParser pattern for model-specific request handling.

When the model architecture is GptOssForCausalLM and a structured
response_format is requested, the schema is now injected into the system
prompt as a '# Response Formats' section and response_format is cleared on
the request to avoid the conflict between Harmony-native mode and the
engine's built-in JSON/response-format mode.

In api_server.py, response_format extraction is moved after parser
instantiation so that the parser can modify the request first.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
@@ -417,15 +417,17 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
 
     random_seed = request.seed if request.seed is not None else None
     max_new_tokens = (request.max_completion_tokens if request.max_completion_tokens else request.max_tokens)
-    response_format = None
-    if request.response_format and request.response_format.type != 'text':
-        response_format = request.response_format.model_dump()
 
     parser_cls = VariableInterface.response_parser_cls
     response_parser = parser_cls(request=request, tokenizer=tokenizer)
-    # request might be adjusted by tool parser
+    # request might be adjusted by the response parser (e.g. GPT-OSS clears
+    # response_format and injects the schema into messages instead)
     request = response_parser.request
 
+    response_format = None
+    if request.response_format and request.response_format.type != 'text':
+        response_format = request.response_format.model_dump()
+
     gen_config = GenerationConfig(
         max_new_tokens=max_new_tokens,
         do_sample=True,
diff --git a/lmdeploy/serve/parsers/_openai_harmony.py b/lmdeploy/serve/parsers/_openai_harmony.py
@@ -3,6 +3,7 @@
 available."""
 from __future__ import annotations
 
+import json
 import re
 from typing import TYPE_CHECKING
 
@@ -16,6 +17,7 @@
     FunctionCall,
     ToolCall,
 )
+from lmdeploy.utils import get_logger
 
 from .response_parser import ResponseParser, ResponseParserManager
 
@@ -24,6 +26,8 @@
 
     from lmdeploy.serve.openai.protocol import ChatCompletionRequest
 
+logger = get_logger('lmdeploy')
+
 _harmony_encoding = None
 
 
@@ -55,6 +59,7 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
         else:
             # Unit tests may inject a lightweight sentinel request object.
             self.request = request
+        self._convert_response_format_to_harmony()
         self.model_tokenizer = tokenizer
         self.parser = StreamableParser(get_encoding(), role=Role.ASSISTANT)
         self._seen_any = False
@@ -64,6 +69,51 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
         self._active_tool_name: str | None = None
         self.tool_parser = object()  # API server checks `is not None` for tool support.
 
+    def _convert_response_format_to_harmony(self):
+        """Convert response_format to Harmony-native mode for GPT-OSS.
+
+        GPT-OSS uses Harmony mode for structured output, which conflicts with
+        the engine's built-in JSON/response-format mode. This method injects
+        the response_format schema into the system prompt as a
+        ``# Response Formats`` section and clears ``response_format`` on the
+        request so that only the Harmony-native instructions are used.
+        """
+        fmt = getattr(self.request, 'response_format', None)  # tolerates requests without this attribute
+        if fmt is None or getattr(fmt, 'type', 'text') == 'text':  # nothing to convert for plain-text output
+            return
+
+        try:
+            format_json = json.dumps(fmt.model_dump())  # serializes the whole format object, not only its schema
+            format_section = f'\n\n# Response Formats\n{format_json}'
+            messages = self.request.messages
+
+            if not isinstance(messages, list):
+                logger.warning('Cannot inject response_format schema into '
+                               'non-list messages for GPT-OSS; clearing response_format only.')
+                self.request = self.request.model_copy(update={'response_format': None})
+                return
+
+            new_messages = list(messages)  # shallow copy; the request's own list is left untouched
+            system_idx = next(  # index of the first dict message whose role is 'system', else None
+                (i for i, msg in enumerate(new_messages) if isinstance(msg, dict) and msg.get('role') == 'system'),
+                None,
+            )
+
+            if system_idx is not None:
+                content = new_messages[system_idx].get('content') or ''  # None/empty content becomes ''
+                new_messages[system_idx] = {**new_messages[system_idx], 'content': content + format_section}
+            else:
+                new_messages.insert(0, {'role': 'system', 'content': format_section})  # none found: prepend one
+
+            self.request = self.request.model_copy(update={  # rebinds self.request to the copy returned by model_copy
+                'response_format': None,
+                'messages': new_messages,
+            })
+        except Exception as e:  # e.g. TypeError from `content + format_section` when system content is not a str
+            logger.error(f'Failed to convert response_format to Harmony-native mode for GPT-OSS: {e}')
+            # Still clear response_format to avoid the Harmony/JSON mode conflict
+            self.request = self.request.model_copy(update={'response_format': None})
+
     def stream_chunk(self, delta_text: str, delta_token_ids: list[int], **kwargs) -> tuple[DeltaMessage | None, bool]:
         if (
             not delta_text