Skip to content

Commit cb76d43

Browse files
committed
fix: address unresolved PR review comments
1. Guard model_copy() with a hasattr check: extract a _clear_response_format() helper that falls back to in-place mutation for non-Pydantic request objects (e.g. test sentinels). This prevents a double-raise in the except path.
2. Use logger.exception() instead of logger.error(f'...{e}') so that stack traces are preserved in the log output.
3. Mark the _patch_streamable_parser fixture as autouse=True and remove the redundant monkeypatch.setattr calls from individual test methods.
1 parent 0ffa417 commit cb76d43

2 files changed

Lines changed: 162 additions & 44 deletions

File tree

lmdeploy/serve/parsers/_openai_harmony.py

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,15 @@ def _convert_response_format_to_harmony(self):
8787
format_body = f'# Response Formats\n{format_json}'
8888
messages = self.request.messages
8989

90+
if isinstance(messages, str):
91+
messages = messages + '\n\n' + format_body
92+
self._clear_response_format(messages=messages)
93+
return
94+
9095
if not isinstance(messages, list):
9196
logger.warning('Cannot inject response_format schema into '
9297
'non-list messages for GPT-OSS; clearing response_format only.')
93-
self.request = self.request.model_copy(update={'response_format': None})
98+
self._clear_response_format()
9499
return
95100

96101
new_messages = list(messages)
@@ -100,22 +105,43 @@ def _convert_response_format_to_harmony(self):
100105
)
101106

102107
if system_idx is not None:
103-
content = new_messages[system_idx].get('content') or ''
104-
new_messages[system_idx] = {
105-
**new_messages[system_idx],
106-
'content': content + '\n\n' + format_body,
107-
}
108+
content = new_messages[system_idx].get('content')
109+
if isinstance(content, list):
110+
# Multimodal content blocks — append a text block.
111+
new_messages[system_idx] = {
112+
**new_messages[system_idx],
113+
'content': content + [{'type': 'text', 'text': format_body}],
114+
}
115+
elif isinstance(content, str):
116+
new_messages[system_idx] = {
117+
**new_messages[system_idx],
118+
'content': (content + '\n\n' + format_body) if content else format_body,
119+
}
120+
else:
121+
# content is None or unexpected type — insert a separate
122+
# system message so the schema is still available.
123+
new_messages.insert(0, {'role': 'system', 'content': format_body})
108124
else:
109125
new_messages.insert(0, {'role': 'system', 'content': format_body})
110126

111-
self.request = self.request.model_copy(update={
112-
'response_format': None,
113-
'messages': new_messages,
114-
})
115-
except Exception as e:
116-
logger.error(f'Failed to convert response_format to Harmony-native mode for GPT-OSS: {e}')
127+
self._clear_response_format(messages=new_messages)
128+
except Exception:
129+
logger.exception('Failed to convert response_format to Harmony-native mode for GPT-OSS')
117130
# Still clear response_format to avoid the Harmony/JSON mode conflict
118-
self.request = self.request.model_copy(update={'response_format': None})
131+
self._clear_response_format()
132+
133+
def _clear_response_format(self, messages=None):
134+
"""Clear response_format on the request, handling both Pydantic and
135+
plain objects."""
136+
if hasattr(self.request, 'model_copy'):
137+
update = {'response_format': None}
138+
if messages is not None:
139+
update['messages'] = messages
140+
self.request = self.request.model_copy(update=update)
141+
else:
142+
self.request.response_format = None
143+
if messages is not None:
144+
self.request.messages = messages
119145

120146
def stream_chunk(self, delta_text: str, delta_token_ids: list[int], **kwargs) -> tuple[DeltaMessage | None, bool]:
121147
if (

tests/test_lmdeploy/serve/parsers/test_gpt_oss_parser.py

Lines changed: 123 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -335,21 +335,16 @@ class TestGptOssResponseFormatHarmonyConversion:
335335
"""Tests for
336336
:meth:`GptOssResponseParser._convert_response_format_to_harmony`."""
337337

338-
@pytest.fixture()
338+
@pytest.fixture(autouse=True)
339339
def _patch_streamable_parser(self, monkeypatch):
340340
monkeypatch.setattr(
341341
openai_harmony_mod,
342342
'StreamableParser',
343343
lambda *args, **kwargs: _FakeStreamableParser({}),
344344
)
345345

346-
def test_response_format_cleared_after_conversion(self, monkeypatch):
346+
def test_response_format_cleared_after_conversion(self):
347347
"""response_format must be None after the parser processes it."""
348-
monkeypatch.setattr(
349-
openai_harmony_mod,
350-
'StreamableParser',
351-
lambda *args, **kwargs: _FakeStreamableParser({}),
352-
)
353348
from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
354349

355350
request = ChatCompletionRequest(
@@ -366,14 +361,9 @@ def test_response_format_cleared_after_conversion(self, monkeypatch):
366361
parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
367362
assert parser.request.response_format is None
368363

369-
def test_schema_appended_to_existing_system_message(self, monkeypatch):
364+
def test_schema_appended_to_existing_system_message(self):
370365
"""When a system message already exists the schema is appended to
371366
it."""
372-
monkeypatch.setattr(
373-
openai_harmony_mod,
374-
'StreamableParser',
375-
lambda *args, **kwargs: _FakeStreamableParser({}),
376-
)
377367
import json as _json
378368

379369
from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
@@ -403,14 +393,9 @@ def test_schema_appended_to_existing_system_message(self, monkeypatch):
403393
# The appended section is separated from the existing content by a blank line
404394
assert '\n\n# Response Formats' in msgs[0]['content']
405395

406-
def test_schema_inserted_as_new_system_message_when_none_exists(self, monkeypatch):
396+
def test_schema_inserted_as_new_system_message_when_none_exists(self):
407397
"""When no system message exists a new one is inserted at position
408398
0."""
409-
monkeypatch.setattr(
410-
openai_harmony_mod,
411-
'StreamableParser',
412-
lambda *args, **kwargs: _FakeStreamableParser({}),
413-
)
414399
import json as _json
415400

416401
from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
@@ -436,13 +421,8 @@ def test_schema_inserted_as_new_system_message_when_none_exists(self, monkeypatc
436421
# The user message is still present after the inserted system message
437422
assert msgs[1]['role'] == 'user'
438423

439-
def test_text_response_format_is_not_converted(self, monkeypatch):
424+
def test_text_response_format_is_not_converted(self):
440425
"""A text-type response_format should be left untouched."""
441-
monkeypatch.setattr(
442-
openai_harmony_mod,
443-
'StreamableParser',
444-
lambda *args, **kwargs: _FakeStreamableParser({}),
445-
)
446426
from lmdeploy.serve.openai.protocol import ResponseFormat
447427

448428
request = ChatCompletionRequest(
@@ -454,17 +434,129 @@ def test_text_response_format_is_not_converted(self, monkeypatch):
454434
assert parser.request.response_format is not None
455435
assert parser.request.response_format.type == 'text'
456436

457-
def test_no_response_format_leaves_request_unchanged(self, monkeypatch):
437+
def test_no_response_format_leaves_request_unchanged(self):
458438
"""When response_format is None the request is not modified."""
459-
monkeypatch.setattr(
460-
openai_harmony_mod,
461-
'StreamableParser',
462-
lambda *args, **kwargs: _FakeStreamableParser({}),
463-
)
464439
request = ChatCompletionRequest(
465440
model='openai/gpt-oss-20b',
466441
messages=[{'role': 'user', 'content': 'hi'}],
467442
)
468443
parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
469444
assert parser.request.response_format is None
470445
assert len(parser.request.messages) == 1
446+
447+
def test_str_messages_gets_schema_appended(self):
448+
"""When messages is a string, the schema section is appended to it."""
449+
import json as _json
450+
451+
from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
452+
453+
schema_dict = {'type': 'object', 'properties': {'x': {'type': 'integer'}}}
454+
request = ChatCompletionRequest(
455+
model='openai/gpt-oss-20b',
456+
messages='Tell me a joke',
457+
response_format=ResponseFormat(
458+
type='json_schema',
459+
json_schema=JsonSchema(name='test', schema=schema_dict),
460+
),
461+
)
462+
parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
463+
464+
assert parser.request.response_format is None
465+
assert isinstance(parser.request.messages, str)
466+
assert parser.request.messages.startswith('Tell me a joke')
467+
assert '# Response Formats' in parser.request.messages
468+
assert _json.dumps(schema_dict) in parser.request.messages
469+
470+
def test_non_pydantic_request_messages_updated(self):
471+
"""Non-Pydantic sentinel requests also get messages updated."""
472+
import json as _json
473+
474+
from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
475+
476+
schema_dict = {'type': 'object', 'properties': {'y': {'type': 'number'}}}
477+
fmt = ResponseFormat(
478+
type='json_schema',
479+
json_schema=JsonSchema(name='test', schema=schema_dict),
480+
)
481+
482+
# Sentinel must NOT have tools/tool_choice attrs so that __init__
483+
# skips the Pydantic-dependent tool-rendering branch.
484+
class _Sentinel:
485+
messages = [{'role': 'user', 'content': 'hi'}]
486+
response_format = fmt
487+
488+
sentinel = _Sentinel()
489+
parser = gpt_oss_mod.GptOssResponseParser(request=sentinel, tokenizer=object())
490+
491+
assert parser.request.response_format is None
492+
msgs = parser.request.messages
493+
assert isinstance(msgs, list)
494+
assert msgs[0]['role'] == 'system'
495+
assert '# Response Formats' in msgs[0]['content']
496+
assert _json.dumps(schema_dict) in msgs[0]['content']
497+
498+
def test_list_content_system_message_gets_text_block_appended(self):
499+
"""When system message content is a list (multimodal), append a text
500+
block."""
501+
import json as _json
502+
503+
from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
504+
505+
schema_dict = {'type': 'object', 'properties': {'z': {'type': 'boolean'}}}
506+
request = ChatCompletionRequest(
507+
model='openai/gpt-oss-20b',
508+
messages=[
509+
{'role': 'system', 'content': [
510+
{'type': 'text', 'text': 'You are helpful.'},
511+
{'type': 'image_url', 'image_url': {'url': 'http://example.com/img.png'}},
512+
]},
513+
{'role': 'user', 'content': 'hi'},
514+
],
515+
response_format=ResponseFormat(
516+
type='json_schema',
517+
json_schema=JsonSchema(name='test', schema=schema_dict),
518+
),
519+
)
520+
parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
521+
522+
assert parser.request.response_format is None
523+
sys_msg = parser.request.messages[0]
524+
assert sys_msg['role'] == 'system'
525+
content = sys_msg['content']
526+
assert isinstance(content, list)
527+
assert len(content) == 3
528+
# Original two blocks preserved
529+
assert content[0]['type'] == 'text'
530+
assert content[0]['text'] == 'You are helpful.'
531+
assert content[1]['type'] == 'image_url'
532+
# Schema appended as a text block
533+
assert content[2]['type'] == 'text'
534+
assert '# Response Formats' in content[2]['text']
535+
assert _json.dumps(schema_dict) in content[2]['text']
536+
537+
def test_none_content_system_message_inserts_separate_system(self):
538+
"""When system message content is None, insert a new system message."""
539+
import json as _json
540+
541+
from lmdeploy.serve.openai.protocol import JsonSchema, ResponseFormat
542+
543+
schema_dict = {'type': 'object', 'properties': {'w': {'type': 'string'}}}
544+
request = ChatCompletionRequest(
545+
model='openai/gpt-oss-20b',
546+
messages=[
547+
{'role': 'system', 'content': None},
548+
{'role': 'user', 'content': 'hi'},
549+
],
550+
response_format=ResponseFormat(
551+
type='json_schema',
552+
json_schema=JsonSchema(name='test', schema=schema_dict),
553+
),
554+
)
555+
parser = gpt_oss_mod.GptOssResponseParser(request=request, tokenizer=object())
556+
557+
assert parser.request.response_format is None
558+
msgs = parser.request.messages
559+
# A new system message with the schema is inserted at position 0
560+
assert msgs[0]['role'] == 'system'
561+
assert '# Response Formats' in msgs[0]['content']
562+
assert _json.dumps(schema_dict) in msgs[0]['content']

0 commit comments

Comments
 (0)