Fix issues found in Copilot review of n parameter support

ziyangliu-666 · windreamer · commit df658b9dd7e8 · 2026-03-26T14:55:08.000+08:00
- Fix TypeError in chat streaming path: create_stream_response_json
  was returning model_dump_json() (str) but cache_block_ids injection
  subscripted it as a dict; switch to model_dump() + json.dumps()
- Fix stateful GptOssChatParser shared across concurrent asyncio.gather
  calls in non-streaming n>1 path; create a fresh instance per choice,
  consistent with the streaming path
- Fix tool-call parse exceptions being swallowed and misreported as
  "Client disconnected"; re-raise so asyncio.gather propagates them,
  wrap gather in try/except to return INTERNAL_SERVER_ERROR
- Add missing test_completion_n_negative_rejected to match the
  existing test_chat_n_negative_rejected
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
@@ -546,7 +546,7 @@ def create_stream_response_json(index: int,
             choices=[choice_data],
             usage=usage,
         )
-        response_json = response.model_dump_json()
+        response_json = response.model_dump()
 
         return response_json
 
@@ -628,7 +628,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
                 if res.cache_block_ids is not None:
                     response_json['cache_block_ids'] = res.cache_block_ids
                     response_json['remote_token_ids'] = res.token_ids
-                yield f'data: {response_json}\n\n'
+                yield f'data: {json.dumps(response_json)}\n\n'
         yield 'data: [DONE]\n\n'
 
     # Streaming response
@@ -664,7 +664,8 @@ async def _collect_chat_response(_i, _gen, _sess):
             remote_token_ids_i.append(res.token_ids)
 
         if gpt_oss_parser:
-            message_i = gpt_oss_parser.parse_full(final_token_ids_i)
+            _parser_i = GptOssChatParser()
+            message_i = _parser_i.parse_full(final_token_ids_i)
             if final_res_i.finish_reason == 'stop' and len(message_i.tool_calls) > 0:
                 final_res_i.finish_reason = 'tool_calls'
         else:
@@ -679,7 +680,7 @@ async def _collect_chat_response(_i, _gen, _sess):
                             final_res_i.finish_reason = 'tool_calls'
                 except Exception as e:
                     logger.error(f'Failed to parse {text_i}. Exception: {e}.')
-                    return False
+                    raise
             elif request.tool_choice != 'none' and request.tools is not None and VariableInterface.tool_parser is None:
                 logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')
 
@@ -713,7 +714,10 @@ async def _collect_chat_response(_i, _gen, _sess):
         _completion_tokens += final_res_i.generate_token_len
         return True
 
-    results = await asyncio.gather(*[_collect_chat_response(_i, generators[_i], sessions[_i]) for _i in range(_n)])
+    try:
+        results = await asyncio.gather(*[_collect_chat_response(_i, generators[_i], sessions[_i]) for _i in range(_n)])
+    except Exception as e:
+        return create_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, str(e))
     if not all(results):
         return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected')
 
diff --git a/tests/test_lmdeploy/test_n_parameter.py b/tests/test_lmdeploy/test_n_parameter.py
@@ -92,6 +92,11 @@ def test_chat_n_negative_rejected(self):
         req = ChatCompletionRequest(model='m', messages='hi', n=-1)
         assert chat_check_request(req, ctx) != ''
 
+    def test_completion_n_negative_rejected(self):
+        ctx = self._make_server_context()
+        req = CompletionRequest(model='m', prompt='hi', n=-1)
+        assert completion_check_request(req, ctx) != ''
+
 
 # ---------------------------------------------------------------------------
 # API handler tests (mocking VariableInterface and raw_request)