diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index e10f20f44e..804bbf87d4 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -393,9 +393,6 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
     request_id = str(session.session_id)
     created_time = int(time.time())
 
-    if isinstance(request.stop, str):
-        request.stop = [request.stop]
-
     tokenizer = VariableInterface.async_engine.tokenizer.model.model
     gen_logprobs, logits_processors = None, None
     if request.logprobs and request.top_logprobs:
@@ -416,18 +413,15 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
                 'Please launch the api_server with --tool-call-parser if you want to use tool.')
 
     random_seed = request.seed if request.seed is not None else None
-    max_new_tokens = (request.max_completion_tokens if request.max_completion_tokens else request.max_tokens)
-    response_format = None
-    if request.response_format and request.response_format.type != 'text':
-        response_format = request.response_format.model_dump()
 
     parser_cls = VariableInterface.response_parser_cls
     response_parser = parser_cls(request=request, tokenizer=tokenizer)
-    # request might be adjusted by tool parser
+    # request is normalized and may be adjusted by the parser
+    # (e.g. GPT-OSS clears response_format and injects the schema into messages)
     request = response_parser.request
 
     gen_config = GenerationConfig(
-        max_new_tokens=max_new_tokens,
+        max_new_tokens=request.max_completion_tokens,
         do_sample=True,
         logprobs=gen_logprobs,
         top_k=request.top_k,
@@ -438,7 +432,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
         stop_words=request.stop,
         include_stop_str_in_output=request.include_stop_str_in_output,
         skip_special_tokens=request.skip_special_tokens,
-        response_format=response_format,
+        response_format=request.response_format,
         logits_processors=logits_processors,
         min_new_tokens=request.min_new_tokens,
         min_p=request.min_p,
diff --git a/lmdeploy/serve/parsers/_openai_harmony.py b/lmdeploy/serve/parsers/_openai_harmony.py
index b1bb492709..9de071dcdd 100644
--- a/lmdeploy/serve/parsers/_openai_harmony.py
+++ b/lmdeploy/serve/parsers/_openai_harmony.py
@@ -3,6 +3,7 @@
 available."""
 from __future__ import annotations
 
+import json
 import re
 from typing import TYPE_CHECKING
 
@@ -16,14 +17,17 @@
     FunctionCall,
     ToolCall,
 )
+from lmdeploy.utils import get_logger
 
-from .response_parser import ResponseParser, ResponseParserManager
+from .response_parser import ResponseParser, ResponseParserManager, normalize_chat_request
 
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizerBase
 
     from lmdeploy.serve.openai.protocol import ChatCompletionRequest
 
+logger = get_logger('lmdeploy')
+
 _harmony_encoding = None
 
 
@@ -55,6 +59,8 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
         else:
             # Unit tests may inject a lightweight sentinel request object.
             self.request = request
+        self._convert_response_format_to_harmony()
+        self.request = normalize_chat_request(self.request)
         self.model_tokenizer = tokenizer
         self.parser = StreamableParser(get_encoding(), role=Role.ASSISTANT)
         self._seen_any = False
@@ -64,6 +70,80 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
         self._active_tool_name: str | None = None
         self.tool_parser = object()  # API server checks `is not None` for tool support.
 
+    def _convert_response_format_to_harmony(self) -> None:
+        """Move a non-``text`` ``response_format`` into the prompt for GPT-OSS.
+
+        Harmony structured output conflicts with the engine's built-in JSON
+        mode, so the format's JSON dump is added as a ``# Response Formats``
+        section (appended to a ``str`` prompt or the first system message, else
+        inserted as a new leading system message) and ``response_format`` is
+        cleared. Injection errors are logged, not raised, and still clear it.
+        """
+        fmt = getattr(self.request, 'response_format', None)
+        if fmt is None or getattr(fmt, 'type', 'text') == 'text':  # a format without ``.type`` counts as 'text'
+            return
+
+        try:
+            format_json = json.dumps(fmt.model_dump())
+            format_body = f'# Response Formats\n{format_json}'
+            messages = self.request.messages
+
+            if isinstance(messages, str):
+                messages = messages + '\n\n' + format_body
+                self._clear_response_format(messages=messages)
+                return
+
+            if not isinstance(messages, list):
+                logger.warning('Cannot inject response_format schema into '
+                               'non-list messages for GPT-OSS; clearing response_format only.')
+                self._clear_response_format()
+                return
+
+            new_messages = list(messages)  # shallow copy: entries are replaced below, never mutated in place
+            system_idx = next(
+                (i for i, msg in enumerate(new_messages) if isinstance(msg, dict) and msg.get('role') == 'system'),
+                None,
+            )
+
+            if system_idx is not None:
+                content = new_messages[system_idx].get('content')
+                if isinstance(content, list):
+                    # List-of-blocks content — append the section as a new text block.
+                    new_messages[system_idx] = {
+                        **new_messages[system_idx],
+                        'content': content + [{'type': 'text', 'text': format_body}],
+                    }
+                elif isinstance(content, str):
+                    new_messages[system_idx] = {
+                        **new_messages[system_idx],
+                        'content': (content + '\n\n' + format_body) if content else format_body,
+                    }
+                else:
+                    # content is None or an unexpected type — prepend a separate
+                    # system message (the original one stays) so the schema is kept.
+                    new_messages.insert(0, {'role': 'system', 'content': format_body})
+            else:
+                new_messages.insert(0, {'role': 'system', 'content': format_body})
+
+            self._clear_response_format(messages=new_messages)
+        except Exception:
+            logger.exception('Failed to convert response_format to Harmony-native mode for GPT-OSS')
+            # Still clear response_format to avoid the Harmony/JSON mode conflict
+            self._clear_response_format()
+    def _clear_response_format(self, messages=None):
+        """Drop ``response_format`` (and replace ``messages`` when given),
+        copying Pydantic requests and mutating plain objects in place."""
+        changes = {'response_format': None}
+        if messages is not None:
+            changes['messages'] = messages
+
+        if hasattr(self.request, 'model_copy'):
+            self.request = self.request.model_copy(update=changes)
+            return
+        for field, value in changes.items():
+            setattr(self.request, field, value)
+
     def stream_chunk(self, delta_text: str, delta_token_ids: list[int], **kwargs) -> tuple[DeltaMessage | None, bool]:
         if (
             not delta_text
diff --git a/lmdeploy/serve/parsers/response_parser.py b/lmdeploy/serve/parsers/response_parser.py
index ba75e2db2b..293c107427 100644
--- a/lmdeploy/serve/parsers/response_parser.py
+++ b/lmdeploy/serve/parsers/response_parser.py
@@ -25,6 +25,38 @@
 ResponseParserManager = Registry('response_parser', locations=['lmdeploy.serve.parsers.response_parser'])
 
 
+def normalize_chat_request(request: ChatCompletionRequest) -> ChatCompletionRequest:
+    """Return ``request`` normalized for downstream use; safe to call twice.
+
+    - ``response_format``: ``ResponseFormat → dict``, ``type='text' → None``;
+      a value that is already a ``dict`` is kept (or cleared when ``text``).
+    - ``stop``: ``str → list[str]``
+    - ``max_completion_tokens``: falls back to deprecated ``max_tokens``
+
+    Objects without ``model_copy`` (test sentinels) are returned untouched;
+    otherwise the input is never mutated and a copy is made only on change.
+    """
+    if not hasattr(request, 'model_copy'):
+        return request
+
+    updates: dict = {}
+    fmt = request.response_format
+    # An already-normalized request carries a plain dict, which has no ``.type``.
+    fmt_type = fmt.get('type') if isinstance(fmt, dict) else getattr(fmt, 'type', None)
+    if fmt is not None and fmt_type == 'text':
+        updates['response_format'] = None
+    elif fmt is not None and not isinstance(fmt, dict):
+        updates['response_format'] = fmt.model_dump()
+    if isinstance(request.stop, str):
+        updates['stop'] = [request.stop]
+    if request.max_completion_tokens is None:
+        max_tokens = getattr(request, 'max_tokens', None)
+        if max_tokens is not None:
+            updates['max_completion_tokens'] = max_tokens
+
+    return request.model_copy(update=updates) if updates else request
+
+
 class ResponseParser:
     @classmethod
     def set_parsers(cls, reasoning_parser_name: str | None = None, tool_parser_name: str | None = None) -> None:
@@ -162,6 +194,9 @@ def __init__(
             self.request = self.tool_parser.adjust_request(request)
         else:
             self.request = self.dump_tools(request)
+
+        self.request = normalize_chat_request(self.request)
+
         self._accumulated_text = ''
 
         self.profile = self._build_profile()
diff --git a/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py b/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py
index a47b632b28..16001ef722 100644
--- a/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py
+++ b/tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py
@@ -329,3 +329,232 @@ def test_parse_complete_appends_tool_call_still_open_at_eof(self, monkeypatch):
     )
     def test_extract_tool_name(self, recipient, expected):
         assert gpt_oss_mod.GptOssResponseParser._extract_tool_name(recipient) == expected
+
+
+class TestGptOssResponseFormatHarmonyConversion:
+    """Tests for
+    :meth:`GptOssResponseParser._convert_response_format_to_harmony`."""
+
+    @pytest.fixture(autouse=True)
+    def _patch_streamable_parser(self, monkeypatch):
+        monkeypatch.setattr(  # every test builds a parser; avoid constructing a real StreamableParser
+            openai_harmony_mod,
+            'StreamableParser',
+            lambda *args, **kwargs: _FakeStreamableParser({}),
+        )
+
+    def test_response_format_cleared_after_conversion(self):
+        """response_format must be None after the parser processes it."""
+        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
+
+        request = ChatCompletionRequest(
+            model='openai/gpt-oss-20b',
+            messages=[{'role': 'user', 'content': 'hi'}],
+            response_format=ResponseFormat(
+                type='json_schema',
+                json_schema=JsonSchema(
+                    name='test',
+                    schema={'type': 'object', 'properties': {'x': {'type': 'integer'}}},
+                ),
+            ),
+        )
+        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
+        assert parser.request.response_format is None
+
+    def test_schema_appended_to_existing_system_message(self):
+        """When a system message already exists the schema is appended to
+        it."""
+        import json as _json
+
+        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
+
+        schema_dict = {'type': 'object', 'properties': {'x': {'type': 'integer'}}}
+        request = ChatCompletionRequest(
+            model='openai/gpt-oss-20b',
+            messages=[
+                {'role': 'system', 'content': 'You are helpful.'},
+                {'role': 'user', 'content': 'hi'},
+            ],
+            response_format=ResponseFormat(
+                type='json_schema',
+                json_schema=JsonSchema(name='test', schema=schema_dict),
+            ),
+        )
+        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
+
+        msgs = parser.request.messages
+        assert msgs[0]['role'] == 'system'
+        assert parser.request.response_format is None
+        # The schema body must appear in the system message
+        assert '# Response Formats' in msgs[0]['content']
+        assert _json.dumps(schema_dict) in msgs[0]['content']
+        # The original content is preserved before the appended section
+        assert msgs[0]['content'].startswith('You are helpful.')
+        # The appended section is separated from the original text by a blank line
+        assert '\n\n# Response Formats' in msgs[0]['content']
+
+    def test_schema_inserted_as_new_system_message_when_none_exists(self):
+        """When no system message exists a new one is inserted at position
+        0."""
+        import json as _json
+
+        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
+
+        schema_dict = {'type': 'object', 'properties': {'name': {'type': 'string'}}}
+        request = ChatCompletionRequest(
+            model='openai/gpt-oss-20b',
+            messages=[{'role': 'user', 'content': 'hi'}],
+            response_format=ResponseFormat(
+                type='json_schema',
+                json_schema=JsonSchema(name='test', schema=schema_dict),
+            ),
+        )
+        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
+
+        msgs = parser.request.messages
+        assert msgs[0]['role'] == 'system'
+        assert parser.request.response_format is None
+        # New system message content must NOT start with blank lines
+        assert not msgs[0]['content'].startswith('\n')
+        assert msgs[0]['content'].startswith('# Response Formats')
+        assert _json.dumps(schema_dict) in msgs[0]['content']
+        # The user message is still present after the inserted system message
+        assert msgs[1]['role'] == 'user'
+
+    def test_text_response_format_is_cleared_by_normalize(self):
+        from lmdeploy.serve.openai.protocol import ResponseFormat
+
+        request = ChatCompletionRequest(
+            model='openai/gpt-oss-20b',
+            messages=[{'role': 'user', 'content': 'hi'}],
+            response_format=ResponseFormat(type='text'),
+        )
+        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
+        assert parser.request.response_format is None
+
+    def test_no_response_format_leaves_request_unchanged(self):
+        """When response_format is None the request is not modified."""
+        request = ChatCompletionRequest(
+            model='openai/gpt-oss-20b',
+            messages=[{'role': 'user', 'content': 'hi'}],
+        )
+        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
+        assert parser.request.response_format is None
+        assert len(parser.request.messages) == 1
+
+    def test_str_messages_gets_schema_appended(self):
+        """When messages is a string, the schema section is appended to it."""
+        import json as _json
+
+        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
+
+        schema_dict = {'type': 'object', 'properties': {'x': {'type': 'integer'}}}
+        request = ChatCompletionRequest(
+            model='openai/gpt-oss-20b',
+            messages='Tell me a joke',
+            response_format=ResponseFormat(
+                type='json_schema',
+                json_schema=JsonSchema(name='test', schema=schema_dict),
+            ),
+        )
+        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
+
+        assert parser.request.response_format is None
+        assert isinstance(parser.request.messages, str)
+        assert parser.request.messages.startswith('Tell me a joke')
+        assert '# Response Formats' in parser.request.messages
+        assert _json.dumps(schema_dict) in parser.request.messages
+
+    def test_non_pydantic_request_messages_updated(self):
+        """Non-Pydantic sentinel requests also get messages updated."""
+        import json as _json
+
+        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
+
+        schema_dict = {'type': 'object', 'properties': {'y': {'type': 'number'}}}
+        fmt = ResponseFormat(
+            type='json_schema',
+            json_schema=JsonSchema(name='test', schema=schema_dict),
+        )
+
+        # Sentinel must NOT have tools/tool_choice attrs so that __init__
+        # skips the Pydantic-dependent tool-rendering branch.
+        class _Sentinel:
+            messages = [{'role': 'user', 'content': 'hi'}]
+            response_format = fmt
+
+        sentinel = _Sentinel()
+        parser = gpt_oss_mod.GptOssResponseParser(request=sentinel, tokenizer=object())
+
+        assert parser.request.response_format is None
+        msgs = parser.request.messages
+        assert isinstance(msgs, list)
+        assert msgs[0]['role'] == 'system'
+        assert '# Response Formats' in msgs[0]['content']
+        assert _json.dumps(schema_dict) in msgs[0]['content']
+
+    def test_list_content_system_message_gets_text_block_appended(self):
+        """When system message content is a list (multimodal), append a text
+        block."""
+        import json as _json
+
+        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
+
+        schema_dict = {'type': 'object', 'properties': {'z': {'type': 'boolean'}}}
+        request = ChatCompletionRequest(
+            model='openai/gpt-oss-20b',
+            messages=[
+                {'role': 'system', 'content': [
+                    {'type': 'text', 'text': 'You are helpful.'},
+                    {'type': 'image_url', 'image_url': {'url': 'http://example.com/img.png'}},
+                ]},
+                {'role': 'user', 'content': 'hi'},
+            ],
+            response_format=ResponseFormat(
+                type='json_schema',
+                json_schema=JsonSchema(name='test', schema=schema_dict),
+            ),
+        )
+        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
+
+        assert parser.request.response_format is None
+        sys_msg = parser.request.messages[0]
+        assert sys_msg['role'] == 'system'
+        content = sys_msg['content']
+        assert isinstance(content, list)
+        assert len(content) == 3
+        # Original two blocks preserved
+        assert content[0]['type'] == 'text'
+        assert content[0]['text'] == 'You are helpful.'
+        assert content[1]['type'] == 'image_url'
+        # Schema appended as a text block
+        assert content[2]['type'] == 'text'
+        assert '# Response Formats' in content[2]['text']
+        assert _json.dumps(schema_dict) in content[2]['text']
+
+    def test_none_content_system_message_inserts_separate_system(self):
+        """When system message content is None, insert a new system message."""
+        import json as _json
+
+        from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
+
+        schema_dict = {'type': 'object', 'properties': {'w': {'type': 'string'}}}
+        request = ChatCompletionRequest(
+            model='openai/gpt-oss-20b',
+            messages=[
+                {'role': 'system', 'content': None},
+                {'role': 'user', 'content': 'hi'},
+            ],
+            response_format=ResponseFormat(
+                type='json_schema',
+                json_schema=JsonSchema(name='test', schema=schema_dict),
+            ),
+        )
+        parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
+
+        assert parser.request.response_format is None
+        msgs = parser.request.messages
+        # A new system message with the schema is inserted at position 0
+        assert msgs[0]['role'] == 'system'
+        assert '# Response Formats' in msgs[0]['content']
+        assert _json.dumps(schema_dict) in msgs[0]['content']