14 changes: 4 additions & 10 deletions lmdeploy/serve/openai/api_server.py
@@ -393,9 +393,6 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
     request_id = str(session.session_id)
     created_time = int(time.time())

-    if isinstance(request.stop, str):
-        request.stop = [request.stop]
-
     tokenizer = VariableInterface.async_engine.tokenizer.model.model
     gen_logprobs, logits_processors = None, None
     if request.logprobs and request.top_logprobs:
@@ -416,18 +413,15 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
             'Please launch the api_server with --tool-call-parser if you want to use tool.')

     random_seed = request.seed if request.seed is not None else None
-    max_new_tokens = (request.max_completion_tokens if request.max_completion_tokens else request.max_tokens)
-    response_format = None
-    if request.response_format and request.response_format.type != 'text':
-        response_format = request.response_format.model_dump()

     parser_cls = VariableInterface.response_parser_cls
     response_parser = parser_cls(request=request, tokenizer=tokenizer)
-    # request might be adjusted by tool parser
+    # request is normalized and may be adjusted by the parser
+    # (e.g. GPT-OSS clears response_format and injects the schema into messages)
     request = response_parser.request

     gen_config = GenerationConfig(
-        max_new_tokens=max_new_tokens,
+        max_new_tokens=request.max_completion_tokens,
         do_sample=True,
         logprobs=gen_logprobs,
         top_k=request.top_k,
@@ -438,7 +432,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
         stop_words=request.stop,
         include_stop_str_in_output=request.include_stop_str_in_output,
         skip_special_tokens=request.skip_special_tokens,
-        response_format=response_format,
+        response_format=request.response_format,
         logits_processors=logits_processors,
         min_new_tokens=request.min_new_tokens,
         min_p=request.min_p,
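For context, a minimal sketch of the invariant the handler now relies on (names are illustrative, not part of the PR): once `response_parser.request` has been read back, `stop`, `max_completion_tokens`, and `response_format` are already in engine-ready form, so `GenerationConfig` can be built straight from request fields.

# Sketch only: the post-normalization invariant assumed by chat_completions_v1.
# `req` stands in for the ChatCompletionRequest returned by the parser.
def gen_config_fields(req) -> dict:
    # Guaranteed by normalize_chat_request():
    #   - req.stop is None or list[str] (never a bare str)
    #   - req.max_completion_tokens is resolved from the deprecated max_tokens
    #   - req.response_format is None or a plain dict (never type='text')
    assert req.stop is None or isinstance(req.stop, list)
    assert req.response_format is None or isinstance(req.response_format, dict)
    return {
        'max_new_tokens': req.max_completion_tokens,
        'stop_words': req.stop,
        'response_format': req.response_format,
    }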
82 changes: 81 additions & 1 deletion lmdeploy/serve/parsers/_openai_harmony.py
@@ -3,6 +3,7 @@
 available."""
 from __future__ import annotations

+import json
 import re
 from typing import TYPE_CHECKING

@@ -16,14 +17,17 @@
     FunctionCall,
     ToolCall,
 )
+from lmdeploy.utils import get_logger

-from .response_parser import ResponseParser, ResponseParserManager
+from .response_parser import ResponseParser, ResponseParserManager, normalize_chat_request

 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizerBase

     from lmdeploy.serve.openai.protocol import ChatCompletionRequest

+logger = get_logger('lmdeploy')
+
 _harmony_encoding = None

@@ -55,6 +59,8 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
         else:
             # Unit tests may inject a lightweight sentinel request object.
             self.request = request
+        self._convert_response_format_to_harmony()
+        self.request = normalize_chat_request(self.request)
         self.model_tokenizer = tokenizer
         self.parser = StreamableParser(get_encoding(), role=Role.ASSISTANT)
         self._seen_any = False
@@ -64,6 +70,80 @@ def __init__(self, request: ChatCompletionRequest, tokenizer: PreTrainedTokenize
         self._active_tool_name: str | None = None
         self.tool_parser = object()  # API server checks `is not None` for tool support.

+    def _convert_response_format_to_harmony(self):
+        """Convert response_format to Harmony-native mode for GPT-OSS.
+
+        GPT-OSS uses Harmony mode for structured output, which conflicts with
+        the engine's built-in JSON/response-format mode. This method injects
+        the response_format schema into the system prompt as a
+        ``# Response Formats`` section and clears ``response_format`` on the
+        request so that only the Harmony-native instructions are used.
+        """
+        fmt = getattr(self.request, 'response_format', None)
+        if fmt is None or getattr(fmt, 'type', 'text') == 'text':
+            return
+
+        try:
+            format_json = json.dumps(fmt.model_dump())
+            format_body = f'# Response Formats\n{format_json}'
+            messages = self.request.messages
+
+            if isinstance(messages, str):
+                messages = messages + '\n\n' + format_body
+                self._clear_response_format(messages=messages)
+                return
+
+            if not isinstance(messages, list):
+                logger.warning('Cannot inject response_format schema into '
+                               'non-list messages for GPT-OSS; clearing response_format only.')
+                self._clear_response_format()
+                return
+
+            new_messages = list(messages)
+            system_idx = next(
+                (i for i, msg in enumerate(new_messages) if isinstance(msg, dict) and msg.get('role') == 'system'),
+                None,
+            )
+
+            if system_idx is not None:
+                content = new_messages[system_idx].get('content')
+                if isinstance(content, list):
+                    # Multimodal content blocks — append a text block.
+                    new_messages[system_idx] = {
+                        **new_messages[system_idx],
+                        'content': content + [{'type': 'text', 'text': format_body}],
+                    }
+                elif isinstance(content, str):
+                    new_messages[system_idx] = {
+                        **new_messages[system_idx],
+                        'content': (content + '\n\n' + format_body) if content else format_body,
+                    }
+                else:
+                    # content is None or unexpected type — insert a separate
+                    # system message so the schema is still available.
+                    new_messages.insert(0, {'role': 'system', 'content': format_body})
+            else:
+                new_messages.insert(0, {'role': 'system', 'content': format_body})
+
+            self._clear_response_format(messages=new_messages)
+        except Exception:
+            logger.exception('Failed to convert response_format to Harmony-native mode for GPT-OSS')
+            # Still clear response_format to avoid the Harmony/JSON mode conflict
+            self._clear_response_format()
+
+    def _clear_response_format(self, messages=None):
+        """Clear response_format on the request, handling both Pydantic and
+        plain objects."""
+        if hasattr(self.request, 'model_copy'):
+            update = {'response_format': None}
+            if messages is not None:
+                update['messages'] = messages
+            self.request = self.request.model_copy(update=update)
+        else:
+            self.request.response_format = None
+            if messages is not None:
+                self.request.messages = messages
+
     def stream_chunk(self, delta_text: str, delta_token_ids: list[int], **kwargs) -> tuple[DeltaMessage | None, bool]:
         if (
             not delta_text
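To make the GPT-OSS behavior concrete, here is a hedged before/after sketch of what `_convert_response_format_to_harmony` does to a request; the schema and message contents are made up for illustration.

# Hypothetical input: a json_schema response_format plus an existing system message.
request_before = {
    'messages': [
        {'role': 'system', 'content': 'You are helpful.'},
        {'role': 'user', 'content': 'Return a JSON user record.'},
    ],
    'response_format': {'type': 'json_schema',
                        'json_schema': {'name': 'user', 'schema': {'type': 'object'}}},
}

# After conversion: the schema is appended to the system prompt as a
# "# Response Formats" section, and response_format is cleared so the
# engine's built-in JSON mode never engages alongside Harmony mode.
request_after = {
    'messages': [
        {'role': 'system',
         'content': 'You are helpful.\n\n# Response Formats\n'
                    '{"type": "json_schema", "json_schema": {"name": "user", "schema": {"type": "object"}}}'},
        {'role': 'user', 'content': 'Return a JSON user record.'},
    ],
    'response_format': None,
}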
35 changes: 35 additions & 0 deletions lmdeploy/serve/parsers/response_parser.py
@@ -25,6 +25,38 @@
 ResponseParserManager = Registry('response_parser', locations=['lmdeploy.serve.parsers.response_parser'])


+def normalize_chat_request(request: ChatCompletionRequest) -> ChatCompletionRequest:
+    """Normalize a ChatCompletionRequest for downstream consumption.
+
+    - ``response_format``: ``ResponseFormat → dict``, ``type='text' → None``
+    - ``stop``: ``str → list[str]``
+    - ``max_completion_tokens``: resolves from deprecated ``max_tokens``
+    """
+    if not hasattr(request, 'model_copy'):
+        return request
+
+    updates: dict = {}
+
+    fmt = request.response_format
+    if fmt is not None and fmt.type != 'text':
+        updates['response_format'] = fmt.model_dump()
+    elif fmt is not None and fmt.type == 'text':
+        updates['response_format'] = None
+
+    if isinstance(request.stop, str):
+        updates['stop'] = [request.stop]
+
+    if request.max_completion_tokens is None:
+        max_tokens = getattr(request, 'max_tokens', None)
+        if max_tokens is not None:
+            updates['max_completion_tokens'] = max_tokens
+
+    if updates:
+        request = request.model_copy(update=updates)
+
+    return request
+
+
 class ResponseParser:
     @classmethod
     def set_parsers(cls, reasoning_parser_name: str | None = None, tool_parser_name: str | None = None) -> None:
@@ -162,6 +194,9 @@ def __init__(
             self.request = self.tool_parser.adjust_request(request)
         else:
             self.request = self.dump_tools(request)
+
+        self.request = normalize_chat_request(self.request)
+
         self._accumulated_text = ''

         self.profile = self._build_profile()
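A short usage sketch of the new helper (field values hypothetical), showing the three normalizations its docstring lists, assuming the request is a Pydantic model with `model_copy`:

from lmdeploy.serve.openai.protocol import ChatCompletionRequest
from lmdeploy.serve.parsers.response_parser import normalize_chat_request

req = ChatCompletionRequest(
    model='gpt-oss',                               # hypothetical model name
    messages=[{'role': 'user', 'content': 'hi'}],
    stop='</s>',                                   # bare string
    max_tokens=128,                                # deprecated alias
)
req = normalize_chat_request(req)
assert req.stop == ['</s>']              # str -> list[str]
assert req.max_completion_tokens == 128  # resolved from max_tokens
assert req.response_format is None       # unset / type='text' maps to None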