diff --git a/lmdeploy/deepseek_v32_encoding.py b/lmdeploy/deepseek_v32_encoding.py new file mode 100644 index 0000000000..afeac6e34b --- /dev/null +++ b/lmdeploy/deepseek_v32_encoding.py @@ -0,0 +1,394 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Adapted from deepseek-ai/DeepSeek-V3.2 encoding/encoding_dsv32.py. +import copy +import json +import re +from typing import Any + +TOOLS_SYSTEM_TEMPLATE = ( + '## Tools\n\n' + "You have access to a set of tools you can use to answer the user's question.\n" + 'You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of ' + 'your reply to the user:\n' + '<{dsml_token}function_calls>\n' + '<{dsml_token}invoke name="$FUNCTION_NAME">\n' + '<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE\n' + '...\n' + '\n' + '<{dsml_token}invoke name="$FUNCTION_NAME2">\n' + '...\n' + '\n' + '\n\n' + 'String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects ' + 'should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" ' + 'for other types (numbers, booleans, arrays, objects).\n\n' + 'If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking ' + 'block. 
def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
    """Render the arguments of one tool call as DSML ``parameter`` elements.

    Args:
        tool_call: Internal tool-call dict. ``tool_call['arguments']`` is either a
            JSON string that decodes to an object or an already-decoded ``dict``.

    Returns:
        One ``parameter`` element per argument, joined with newlines. An empty
        argument object yields ``''``.

    Raises:
        ValueError: If the (decoded) arguments are not a JSON object. Invalid JSON
            text surfaces as ``json.JSONDecodeError``, a ``ValueError`` subclass.
    """
    raw_arguments = tool_call['arguments']
    arguments = json.loads(raw_arguments) if isinstance(raw_arguments, str) else raw_arguments
    if not isinstance(arguments, dict):
        raise ValueError('Assistant tool call function.arguments must be a JSON object.')

    # Each parameter must carry its closing tag: parse_tool_calls() in this module
    # reads a parameter up to '/{dsml_token}parameter' and requires the value to be
    # followed by '<', so an unclosed element cannot be parsed back.
    p_dsml_template = (
        '<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>'
    )

    return '\n'.join(
        p_dsml_template.format(
            dsml_token=dsml_token,
            key=key,
            # Strings are emitted verbatim; every other type is serialized as JSON.
            is_str='true' if isinstance(value, str) else 'false',
            value=value if isinstance(value, str) else to_json(value),
        )
        for key, value in arguments.items()
    )
tool_calls_from_openai_format(tool_calls) + + if role == 'system': + prompt += system_msg_template.format(content=content or '') + if tools: + prompt += '\n\n' + render_tools(tools) + + if response_format: + prompt += '\n\n' + response_format_template.format(schema=to_json(response_format)) + + elif role == 'developer': + assert content, f'Invalid message for role `{role}`: {msg}' + content_developer = '' + if tools: + content_developer += '\n\n' + render_tools(tools) + + if response_format: + content_developer += '\n\n' + response_format_template.format(schema=to_json(response_format)) + + content_developer += f"\n\n# The user's message is: {content}" + + prompt += user_msg_template.format(content=content_developer) + if index == last_user_idx and thinking_mode == 'thinking': + prompt += thinking_start_token + else: + prompt += thinking_end_token + + elif role == 'user': + prompt += user_msg_template.format(content=content) + + if index == last_user_idx and thinking_mode == 'thinking': + prompt += thinking_start_token + else: + prompt += thinking_end_token + + elif role == 'tool': + prev_assistant_idx = index - 1 + assistant_msg = messages[prev_assistant_idx] + while prev_assistant_idx >= 0 and assistant_msg.get('role') == 'tool': + prev_assistant_idx -= 1 + assistant_msg = messages[prev_assistant_idx] + + assert ( + index == 0 or prev_assistant_idx >= 0 and assistant_msg.get('role') == 'assistant' + ), f'Invalid messages at {index}:\n{assistant_msg}' + + tool_call_order = index - prev_assistant_idx + assistant_tool_calls = assistant_msg.get('tool_calls') + assert ( + assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order + ), 'No tool calls but found tool output' + + if tool_call_order == 1: + prompt += '\n\n' + + prompt += tool_output_template.format(content=content) + + if tool_call_order == len(assistant_tool_calls): + prompt += '\n' + + if index >= last_user_idx and thinking_mode == 'thinking': + prompt += '\n\n' + thinking_start_token + else: 
+ prompt += '\n\n' + thinking_end_token + + elif role == 'assistant': + prev_assistant_idx = index + thinking_part = '' + + tool_calls_content = '' + if tool_calls: + tool_calls = [ + tool_call_template.format( + dsml_token=dsml_token, + name=tool_call.get('name'), + arguments=encode_arguments_to_dsml(tool_call) + ) + for tool_call in tool_calls + ] + tool_calls_content += '\n\n' + tool_calls_template.format( + dsml_token=dsml_token, + tool_calls='\n'.join(tool_calls) + ) + + summary_content = content or '' + + if thinking_mode == 'thinking' and index > last_user_idx: + assert reasoning_content or tool_calls, ( + f'ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` ' + 'after last user message') + thinking_part = thinking_template.format(reasoning_content=reasoning_content or '') + thinking_end_token + + prompt += assistant_msg_template.format( + reasoning=thinking_part, + content=summary_content, + tool_calls=tool_calls_content, + ) + else: + raise NotImplementedError(f'Unknown role: {role}') + + return prompt + +def drop_thinking_messages(messages: list[dict[str, Any]], last_user_idx: int | None = None) -> list[dict[str, Any]]: + messages_wo_thinking: list[dict[str, Any]] = [] + last_user_idx = find_last_user_index(messages) if last_user_idx is None else last_user_idx + for idx, msg in enumerate(messages): + role = msg.get('role') + if role in ['user', 'system', 'tool'] or idx >= last_user_idx: + messages_wo_thinking.append(msg) + continue + + elif role == 'assistant': + msg_wo_thinking = copy.copy(msg) + msg_wo_thinking.pop('reasoning_content', None) + messages_wo_thinking.append(msg_wo_thinking) + + return messages_wo_thinking + +def encode_messages(messages: list[dict[str, Any]], + thinking_mode: str, + context: list[dict[str, Any]] | None = None, + drop_thinking: bool = True, + add_default_bos_token: bool = True) -> str: + context = context if context else [] + full_messages = context + messages + + prompt = bos_token 
if add_default_bos_token and len(context) == 0 else '' + + if thinking_mode == 'thinking' and drop_thinking: + full_messages = drop_thinking_messages(full_messages) + + for idx in range(len(messages)): + prompt += render_message(idx + len(context), full_messages, thinking_mode=thinking_mode) + + return prompt + +def _read_until_stop(index: int, text: str, stop: list[str]) -> tuple[int, str, str | None]: + min_pos = len(text) + matched_stop = None + + for s in stop: + pos = text.find(s, index) + if pos != -1 and pos < min_pos: + min_pos = pos + matched_stop = s + + if matched_stop: + content = text[index:min_pos] + return min_pos + len(matched_stop), content, matched_stop + else: + content = text[index:] + return len(text), content, None + +def parse_tool_calls(index: int, text: str): + tool_calls: list[dict[str, Any]] = [] + stop_token = None + tool_calls_end_token = f'' + + while index < len(text): + index, _, stop_token = _read_until_stop(index, text, [f'<{dsml_token}invoke', tool_calls_end_token]) + assert _ == '>\n', 'Tool call format error' + + if stop_token == tool_calls_end_token: + break + + assert stop_token is not None, 'Missing special token' + + index, tool_name_content, stop_token = _read_until_stop( + index, text, [f'<{dsml_token}parameter', f'\n$', tool_name_content, flags=re.DOTALL) + assert len(p_tool_name) == 1, 'Tool name format error' + tool_name = p_tool_name[0] + + tool_args: dict[str, tuple[str, str]] = {} + while stop_token == f'<{dsml_token}parameter': + index, param_content, stop_token = _read_until_stop(index, text, [f'/{dsml_token}parameter']) + + param_kv = re.findall(r'^ name="(.*?)" string="(true|false)">(.*?)<$', param_content, flags=re.DOTALL) + assert len(param_kv) == 1, 'Parameter format error' + param_name, string, param_value = param_kv[0] + + assert param_name not in tool_args, 'Duplicate parameter name' + tool_args[param_name] = (param_value, string) + + index, content, stop_token = _read_until_stop( + index, text, 
[f'<{dsml_token}parameter', f'\n', 'Parameter format error' + + tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args) + tool_calls.append(tool_call) + + return index, stop_token, tool_calls + +# NOTE: This function parses only correctly formatted strings and will not attempt to correct +# malformed output that may be generated by the model. +def parse_message_from_completion_text(text: str, thinking_mode: str): + summary_content, reasoning_content, tool_calls = '', '', [] + index, stop_token = 0, None + tool_calls_start_token = f'\n\n<{dsml_token}function_calls' + + is_thinking, is_tool_calling = thinking_mode == 'thinking', False + + if is_thinking: + index, content_delta, stop_token = _read_until_stop(index, text, [thinking_end_token, tool_calls_start_token]) + reasoning_content = content_delta + assert stop_token == thinking_end_token, 'Invalid thinking format' + + index, content_delta, stop_token = _read_until_stop(index, text, [eos_token, tool_calls_start_token]) + summary_content = content_delta + if stop_token == tool_calls_start_token: + is_tool_calling = True + else: + assert stop_token == eos_token, 'Invalid summary format' + + if is_tool_calling: + index, stop_token, tool_calls = parse_tool_calls(index, text) + + index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token]) + assert not tool_ends_text, 'Unexpected content after tool calls' + + assert len(text) == index and stop_token in [eos_token, None], 'Unexpected content at end' + + for sp_token in [bos_token, eos_token, thinking_start_token, thinking_end_token, dsml_token]: + assert ( + sp_token not in summary_content and sp_token not in reasoning_content + ), 'Unexpected special token in content' + + return { + 'role': 'assistant', + 'content': summary_content, + 'reasoning_content': reasoning_content, + 'tool_calls': tool_calls_to_openai_format(tool_calls) + } diff --git a/lmdeploy/deepseek_v4_encoding.py b/lmdeploy/deepseek_v4_encoding.py new file mode 
100644 index 0000000000..c24c4c5584 --- /dev/null +++ b/lmdeploy/deepseek_v4_encoding.py @@ -0,0 +1,743 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Adapted from deepseek-ai/DeepSeek-V4-Pro encoding/encoding_dsv4.py. +"""DeepSeek-V4 Encoding. + +A self-contained implementation for encoding/decoding DeepSeek-V4 chat messages with tool calling, thinking mode, and +quick instruction task support. +""" + +import copy +import json +import re +from typing import Any + +# ============================================================ +# Special Tokens +# ============================================================ + +bos_token: str = '<|begin▁of▁sentence|>' +eos_token: str = '<|end▁of▁sentence|>' +thinking_start_token: str = '' +thinking_end_token: str = '' +dsml_token: str = '|DSML|' + +USER_SP_TOKEN = '<|User|>' +ASSISTANT_SP_TOKEN = '<|Assistant|>' +LATEST_REMINDER_SP_TOKEN = '<|latest_reminder|>' + +# Task special tokens for internal classification tasks +DS_TASK_SP_TOKENS = { + 'action': '<|action|>', + 'query': '<|query|>', + 'authority': '<|authority|>', + 'domain': '<|domain|>', + 'title': '<|title|>', + 'read_url': '<|read_url|>', +} +VALID_TASKS = set(DS_TASK_SP_TOKENS.keys()) + +# ============================================================ +# Templates +# ============================================================ + +system_msg_template: str = '{content}' +user_msg_template: str = '{content}' +latest_reminder_msg_template: str = '{content}' +assistant_msg_template: str = '{reasoning}{content}{tool_calls}' + eos_token +assistant_msg_wo_eos_template: str = '{reasoning}{content}{tool_calls}' +thinking_template: str = '{reasoning_content}' + +response_format_template: str = ( + '## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}' +) +tool_call_template: str = ( + "<{dsml_token}invoke name=\"{name}\">\n{arguments}\n" +) +tool_calls_template = ( + '<{dsml_token}{tc_block_name}>\n{tool_calls}\n' +) 
+tool_calls_block_name: str = 'tool_calls' + +tool_output_template: str = ( + '{content}' +) + +REASONING_EFFORT_MAX = ( + 'Reasoning Effort: Absolute maximum with no shortcuts permitted.\n' + 'You MUST be very thorough in your thinking and comprehensively decompose the problem to resolve the ' + 'root cause, rigorously stress-testing your logic against all potential paths, edge cases, and adversarial ' + 'scenarios.\n' + 'Explicitly write out your entire deliberation process, documenting every intermediate step, considered ' + 'alternative, and rejected hypothesis to ensure absolutely no assumption is left unchecked.\n\n' +) + +TOOLS_TEMPLATE = ( + '## Tools\n\n' + "You have access to a set of tools to help answer the user's question. You can invoke tools by writing a " + '"<{dsml_token}tool_calls>" block like the following:\n\n' + '<{dsml_token}tool_calls>\n' + '<{dsml_token}invoke name="$TOOL_NAME">\n' + '<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE\n' + '...\n' + '\n' + '<{dsml_token}invoke name="$TOOL_NAME2">\n' + '...\n' + '\n' + '\n\n' + 'String parameters should be specified as is and set `string="true"`. 
def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
    """Encode tool call arguments into DSML parameter format.

    Args:
        tool_call: Dict with "name" and "arguments" keys. "arguments" is normally a
            JSON string holding an object; an already-decoded dict is used as is.

    Returns:
        DSML-formatted parameter string: one ``parameter`` element per argument,
        joined with newlines ('' when there are no arguments). Arguments that are
        not a JSON object (invalid JSON, or JSON holding a list/scalar) are emitted
        best-effort as a single parameter named "arguments" carrying the raw value.
    """
    raw_arguments = tool_call['arguments']

    if isinstance(raw_arguments, dict):
        # Already decoded: json.loads() would raise TypeError on a dict.
        arguments = raw_arguments
    else:
        try:
            arguments = json.loads(raw_arguments)
        except (TypeError, ValueError, RecursionError):
            arguments = None
        if not isinstance(arguments, dict):
            # Best-effort fallback: keep the payload instead of failing the render.
            arguments = {'arguments': raw_arguments}

    # Each parameter must carry its closing tag: parse_tool_calls() in this module
    # reads a parameter up to '/{dsml_token}parameter' and requires the value to be
    # followed by '<', so an unclosed element cannot be parsed back.
    p_dsml_template = (
        '<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>'
    )

    return '\n'.join(
        p_dsml_template.format(
            dsml_token=dsml_token,
            key=key,
            # Strings are emitted verbatim; every other type is serialized as JSON.
            is_str='true' if isinstance(value, str) else 'false',
            value=value if isinstance(value, str) else to_json(value),
        )
        for key, value in arguments.items()
    )
def find_last_user_index(messages: list[dict[str, Any]]) -> int:
    """Return the position of the final user or developer message.

    Args:
        messages: Conversation messages in order.

    Returns:
        Index of the last message whose role is "user" or "developer", or -1 when
        the conversation contains no such message.
    """
    last_position = len(messages) - 1
    for offset, message in enumerate(reversed(messages)):
        if message.get('role') in ['user', 'developer']:
            return last_position - offset
    return -1
+ """ + assert 0 <= index < len(messages) + assert thinking_mode in ['chat', 'thinking'], f'Invalid thinking_mode `{thinking_mode}`' + + prompt = '' + msg = messages[index] + last_user_idx = find_last_user_index(messages) + + role = msg.get('role') + content = msg.get('content') + tools = msg.get('tools') + response_format = msg.get('response_format') + tool_calls = msg.get('tool_calls') + reasoning_content = msg.get('reasoning_content') + wo_eos = msg.get('wo_eos', False) + + if tools: + tools = tools_from_openai_format(tools) + if tool_calls: + tool_calls = tool_calls_from_openai_format(tool_calls) + + # Reasoning effort prefix (only at index 0 in thinking mode with max effort) + assert reasoning_effort in ['max', None, 'high'], f'Invalid reasoning effort: {reasoning_effort}' + if index == 0 and thinking_mode == 'thinking' and reasoning_effort == 'max': + prompt += REASONING_EFFORT_MAX + + if role == 'system': + prompt += system_msg_template.format(content=content or '') + if tools: + prompt += '\n\n' + render_tools(tools) + if response_format: + prompt += '\n\n' + response_format_template.format(schema=to_json(response_format)) + + elif role == 'developer': + assert content, f'Invalid message for role `{role}`: {msg}' + + content_developer = USER_SP_TOKEN + content_developer += content + + if tools: + content_developer += '\n\n' + render_tools(tools) + if response_format: + content_developer += '\n\n' + response_format_template.format(schema=to_json(response_format)) + + prompt += user_msg_template.format(content=content_developer) + + elif role == 'user': + prompt += USER_SP_TOKEN + + # Handle content blocks (tool results mixed with text) + content_blocks = msg.get('content_blocks') + if content_blocks: + parts = [] + for block in content_blocks: + block_type = block.get('type') + if block_type == 'text': + parts.append(block.get('text', '')) + elif block_type == 'tool_result': + tool_content = block.get('content', '') + if isinstance(tool_content, list): + 
text_parts = [] + for b in tool_content: + if b.get('type') == 'text': + text_parts.append(b.get('text', '')) + else: + text_parts.append(f"[Unsupported {b.get('type')}]") + tool_content = '\n\n'.join(text_parts) + parts.append(tool_output_template.format(content=tool_content)) + else: + parts.append(f'[Unsupported {block_type}]') + prompt += '\n\n'.join(parts) + else: + prompt += content or '' + + elif role == 'latest_reminder': + prompt += LATEST_REMINDER_SP_TOKEN + latest_reminder_msg_template.format(content=content) + + elif role == 'tool': + raise NotImplementedError( + 'deepseek_v4 merges tool messages into user; please preprocess with merge_tool_messages()') + + elif role == 'assistant': + thinking_part = '' + tc_content = '' + + if tool_calls: + tc_list = [ + tool_call_template.format( + dsml_token=dsml_token, + name=tc.get('name'), + arguments=encode_arguments_to_dsml(tc) + ) + for tc in tool_calls + ] + tc_content += '\n\n' + tool_calls_template.format( + dsml_token=dsml_token, + tool_calls='\n'.join(tc_list), + tc_block_name=tool_calls_block_name, + ) + + summary_content = content or '' + rc = reasoning_content or '' + + # Check if previous message has a task - if so, this is a task output (no thinking) + prev_has_task = index - 1 >= 0 and messages[index - 1].get('task') is not None + + if thinking_mode == 'thinking' and not prev_has_task: + if not drop_thinking or index > last_user_idx: + thinking_part = thinking_template.format(reasoning_content=rc) + thinking_end_token + else: + thinking_part = '' + + if wo_eos: + prompt += assistant_msg_wo_eos_template.format( + reasoning=thinking_part, + content=summary_content, + tool_calls=tc_content, + ) + else: + prompt += assistant_msg_template.format( + reasoning=thinking_part, + content=summary_content, + tool_calls=tc_content, + ) + else: + raise NotImplementedError(f'Unknown role: {role}') + + # Append transition tokens based on what follows + if index + 1 < len(messages) and messages[index + 1].get('role') 
def merge_tool_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Fold "tool" messages into user messages expressed as content blocks.

    Every input message is deep-copied, so the caller's list is not modified.

    - A "tool" message becomes a ``tool_result`` block. It is appended to the
      trailing user message when that message already carries ``content_blocks``;
      otherwise a new user message holding just that block is started.
    - A "user" message becomes a ``text`` block. It is appended to the trailing
      user message when that message carries ``content_blocks`` and has no
      ``task``; otherwise a new user message is created from its ``content`` and
      its ``task``/``wo_eos``/``mask`` fields (other fields are not carried over).
    - Messages of any other role are passed through as copies.

    Args:
        messages: List of message dicts in OpenAI format.

    Returns:
        Processed message list with tool messages merged into user messages.
    """
    result: list[dict[str, Any]] = []

    def _open_user_turn():
        """Return the trailing user message that already has content blocks, if any."""
        if not result:
            return None
        tail = result[-1]
        if tail.get('role') == 'user' and 'content_blocks' in tail:
            return tail
        return None

    for original in messages:
        message = copy.deepcopy(original)
        role = message.get('role')

        if role not in ('tool', 'user'):
            result.append(message)
            continue

        tail = _open_user_turn()

        if role == 'tool':
            block = {
                'type': 'tool_result',
                'tool_use_id': message.get('tool_call_id', ''),
                'content': message.get('content', ''),
            }
            if tail is not None:
                tail['content_blocks'].append(block)
            else:
                result.append({'role': 'user', 'content_blocks': [block]})
            continue

        # role == 'user'
        block = {'type': 'text', 'text': message.get('content', '')}
        if tail is not None and tail.get('task') is None:
            tail['content_blocks'].append(block)
            continue

        fresh = {
            'role': 'user',
            'content': message.get('content', ''),
            'content_blocks': [block],
        }
        # Only these extra fields survive the conversion.
        fresh.update({key: message[key] for key in ('task', 'wo_eos', 'mask') if key in message})
        result.append(fresh)

    return result
+ """ + last_tool_call_order: dict[str, int] = {} + + for msg in messages: + role = msg.get('role') + if role == 'assistant' and msg.get('tool_calls'): + last_tool_call_order = {} + for idx, tc in enumerate(msg['tool_calls']): + tc_id = tc.get('id') or tc.get('function', {}).get('id', '') + if tc_id: + last_tool_call_order[tc_id] = idx + + elif role == 'user' and msg.get('content_blocks'): + tool_blocks = [b for b in msg['content_blocks'] if b.get('type') == 'tool_result'] + if len(tool_blocks) > 1 and last_tool_call_order: + sorted_blocks = sorted( + tool_blocks, + key=lambda b: last_tool_call_order.get(b.get('tool_use_id', ''), 0) + ) + sorted_idx = 0 + new_blocks = [] + for block in msg['content_blocks']: + if block.get('type') == 'tool_result': + new_blocks.append(sorted_blocks[sorted_idx]) + sorted_idx += 1 + else: + new_blocks.append(block) + msg['content_blocks'] = new_blocks + + return messages + + +# ============================================================ +# Main Encoding Function +# ============================================================ + +def encode_messages( + messages: list[dict[str, Any]], + thinking_mode: str, + context: list[dict[str, Any]] | None = None, + drop_thinking: bool = True, + add_default_bos_token: bool = True, + reasoning_effort: str | None = None, +) -> str: + """Encode a list of messages into the DeepSeek-V4 prompt format. + + This is the main entry point for encoding conversations. It handles: + - BOS token insertion + - Thinking mode with optional reasoning content dropping + - Tool message merging into user messages + - Multi-turn conversation context + + Args: + messages: List of message dicts to encode. + thinking_mode: Either "chat" or "thinking". + context: Optional preceding context messages (already encoded prefix). + drop_thinking: If True, drop reasoning_content from earlier assistant turns + (only keep reasoning for messages after the last user message). 
def _drop_thinking_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Strip reasoning from assistant turns that precede the last user message.

    Behavior:
    - Messages with role in ["user", "system", "tool", "latest_reminder",
      "direct_search_results"] are always kept.
    - Messages at or after the last user/developer index are always kept.
    - Assistant messages before that index are shallow-copied with
      ``reasoning_content`` removed; the input message is left untouched.
    - Messages of any other role (e.g. developer) before that index are dropped.

    Args:
        messages: Conversation messages in order.

    Returns:
        A new list containing the surviving messages in their original order.
    """
    cutoff = find_last_user_index(messages)
    always_kept = {'user', 'system', 'tool', 'latest_reminder', 'direct_search_results'}

    def _prune(position, message):
        """Return the message to keep for this slot, or None to drop it."""
        role = message.get('role')
        if role in always_kept or position >= cutoff:
            return message
        if role == 'assistant':
            slim = copy.copy(message)
            slim.pop('reasoning_content', None)
            return slim
        return None

    pruned = (_prune(position, message) for position, message in enumerate(messages))
    return [message for message in pruned if message is not None]
def parse_message_from_completion_text(text: str, thinking_mode: str) -> dict[str, Any]:
    """Parse a model completion text into a structured assistant message.

    This function takes the raw text output from the model (a single assistant turn)
    and extracts:
    - reasoning_content (thinking block)
    - content (summary/response)
    - tool_calls (if any)

    NOTE: This function is designed to parse only correctly formatted strings and
    will raise ValueError for malformed output.

    Args:
        text: The raw completion text (including EOS token).
        thinking_mode: Either "chat" or "thinking".

    Returns:
        Dict with keys: "role", "content", "reasoning_content", "tool_calls".
        tool_calls are in OpenAI format.

    Raises:
        ValueError: If the thinking block is not terminated, the EOS token is
            missing, there is unexpected content after the tool calls or at the
            end, or a special token appears inside the content or reasoning.
    """
    summary_content, reasoning_content, tool_calls = '', '', []
    index, stop_token = 0, None
    tool_calls_start_token = f'\n\n<{dsml_token}{tool_calls_block_name}'

    is_thinking = thinking_mode == 'thinking'
    is_tool_calling = False

    # Validation raises explicitly instead of using ``assert`` so that it still
    # runs under ``python -O`` and matches the documented ValueError contract.
    if is_thinking:
        index, reasoning_content, stop_token = _read_until_stop(
            index, text, [thinking_end_token, tool_calls_start_token])
        if stop_token != thinking_end_token:
            raise ValueError(f'Invalid thinking format: missing {thinking_end_token}')

    index, summary_content, stop_token = _read_until_stop(index, text, [eos_token, tool_calls_start_token])
    if stop_token == tool_calls_start_token:
        is_tool_calling = True
    elif stop_token != eos_token:
        raise ValueError('Invalid format: missing EOS token')

    if is_tool_calling:
        index, stop_token, tool_calls = parse_tool_calls(index, text)

        # Only the EOS token may follow the tool-call block.
        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
        if tool_ends_text:
            raise ValueError('Unexpected content after tool calls')

    if len(text) != index or stop_token not in (eos_token, None):
        raise ValueError('Unexpected content at end')

    for sp_token in [bos_token, eos_token, thinking_start_token, thinking_end_token, dsml_token]:
        if sp_token in summary_content or sp_token in reasoning_content:
            raise ValueError(f"Unexpected special token '{sp_token}' in content")

    return {
        'role': 'assistant',
        'content': summary_content,
        'reasoning_content': reasoning_content,
        'tool_calls': tool_calls_to_openai_format(tool_calls)
    }
@MODELS.register_module(name=['deepseek-v4'])
class DeepseekV4ChatTemplate(BaseChatTemplate):
    """Chat template of DeepSeek-V4 models.

    Prompt rendering is delegated to
    ``lmdeploy.deepseek_v4_encoding.encode_messages``; this class only adapts
    the generic chat-template keyword arguments to that encoder.
    """

    def __init__(self, eoa='<|end▁of▁sentence|>', stop_words=['<|end▁of▁sentence|>'], **kwargs):
        super().__init__(eoa=eoa, stop_words=stop_words, **kwargs)

    def get_prompt(self, prompt, sequence_start=True, **kwargs):
        """Render a single user prompt string."""
        single_turn = [{'role': 'user', 'content': prompt}]
        return self.messages2prompt(single_turn, sequence_start, **kwargs)

    def messages2prompt(self, messages, sequence_start=True, **kwargs):
        """Render ``messages`` (a string or a list of message dicts) into a
        prompt.

        Recognised keyword arguments: ``tools``, ``reasoning_effort`` (only
        'high' and 'max' are forwarded), ``thinking`` / ``enable_thinking``
        (either one switches to thinking mode) and ``drop_thinking``.
        """
        from lmdeploy.deepseek_v4_encoding import encode_messages

        if isinstance(messages, str):
            messages = [{'role': 'user', 'content': messages}]

        tools = self._normalize_tools(kwargs.pop('tools', None))
        if tools:
            messages = self._with_tools(messages, tools)
        else:
            messages = list(messages)

        effort = kwargs.pop('reasoning_effort', None)
        if effort not in ('high', 'max'):
            effort = None

        # Both switches are always popped so neither leaks further down.
        thinking_flag = kwargs.pop('thinking', False)
        enable_flag = kwargs.pop('enable_thinking', False)
        mode = 'thinking' if (thinking_flag or enable_flag) else 'chat'

        return encode_messages(messages,
                               thinking_mode=mode,
                               drop_thinking=kwargs.pop('drop_thinking', True),
                               add_default_bos_token=sequence_start,
                               reasoning_effort=effort)

    @staticmethod
    def _normalize_tools(tools):
        """Return ``tools`` as a list of ``{'type': 'function', 'function':
        ...}`` dicts, or None when nothing usable is left.

        Objects exposing ``model_dump`` are converted with it first; entries
        that are still not dicts are skipped.
        """
        if not tools:
            return None

        wrapped = []
        for entry in tools:
            if hasattr(entry, 'model_dump'):
                entry = entry.model_dump()
            if not isinstance(entry, dict):
                continue
            wrapped.append(entry if 'function' in entry else {'type': 'function', 'function': entry})
        return wrapped or None

    @staticmethod
    def _with_tools(messages, tools):
        """Return shallow copies of ``messages`` with ``tools`` attached to the
        first system/developer message, adding an empty system message in front
        when there is none."""
        copies = [dict(message) for message in messages]
        carrier = next((m for m in copies if m.get('role') in ('system', 'developer')), None)
        if carrier is None:
            return [{'role': 'system', 'content': '', 'tools': tools}] + copies
        carrier['tools'] = tools
        return copies

    @classmethod
    def match(cls, model_path: str, trust_remote_code: bool = False, **kwargs) -> str | None:
        """Return 'deepseek-v4' when the model config identifies a DeepSeek-V4
        model, else None.

        Falls back to reading ``config.json`` directly when ``get_model_arch``
        fails for any reason.
        """
        try:
            arch, cfg = get_model_arch(model_path, trust_remote_code=trust_remote_code)
            cfg_dict = cfg.to_dict()
        except Exception:
            cfg_dict = {}
            raw_config = os.path.join(model_path, 'config.json')
            if os.path.exists(raw_config):
                try:
                    with open(raw_config, encoding='utf-8') as f:
                        cfg_dict = json.load(f)
                except Exception:
                    cfg_dict = {}
            arch = (cfg_dict.get('architectures') or [None])[0]

        is_v4 = arch == 'DeepseekV4ForCausalLM' or cfg_dict.get('model_type') == 'deepseek_v4'
        return 'deepseek-v4' if is_v4 else None
@MODELS.register_module(name=['deepseek-v32', 'deepseek-v3.2'])
class DeepseekV32ChatTemplate(BaseChatTemplate):
    """Chat template of DeepSeek-V3.2 models.

    Prompt rendering is delegated to
    ``lmdeploy.deepseek_v32_encoding.encode_messages``; tool normalisation is
    shared with ``DeepseekV4ChatTemplate``.
    """

    def __init__(self, eoa='<|end▁of▁sentence|>', stop_words=['<|end▁of▁sentence|>'], **kwargs):
        super().__init__(eoa=eoa, stop_words=stop_words, **kwargs)

    def get_prompt(self, prompt, sequence_start=True, **kwargs):
        """Render a single user prompt string."""
        single_turn = [{'role': 'user', 'content': prompt}]
        return self.messages2prompt(single_turn, sequence_start, **kwargs)

    def messages2prompt(self, messages, sequence_start=True, **kwargs):
        """Render ``messages`` (a string or a list of message dicts) into a
        prompt.

        Recognised keyword arguments: ``tools``, ``thinking`` /
        ``enable_thinking`` (either one switches to thinking mode) and
        ``drop_thinking``. When ``drop_thinking`` is not given, it defaults to
        True only if the last message has the 'user' role.
        """
        from lmdeploy.deepseek_v32_encoding import encode_messages

        if isinstance(messages, str):
            messages = [{'role': 'user', 'content': messages}]

        tools = DeepseekV4ChatTemplate._normalize_tools(kwargs.pop('tools', None))
        if tools:
            messages = DeepseekV4ChatTemplate._with_tools(messages, tools)
        else:
            messages = list(messages)

        # Both switches are always popped so neither leaks further down.
        thinking_flag = kwargs.pop('thinking', False)
        enable_flag = kwargs.pop('enable_thinking', False)
        mode = 'thinking' if (thinking_flag or enable_flag) else 'chat'

        drop_thinking = kwargs.pop('drop_thinking', None)
        if drop_thinking is None:
            drop_thinking = bool(messages and messages[-1].get('role') == 'user')

        return encode_messages(messages,
                               thinking_mode=mode,
                               drop_thinking=drop_thinking,
                               add_default_bos_token=sequence_start)

    @classmethod
    def match(cls, model_path: str, trust_remote_code: bool = False, **kwargs) -> str | None:
        """Return 'deepseek-v32' when the model config identifies a
        DeepSeek-V3.2 model, else None.

        Falls back to reading ``config.json`` directly when ``get_model_arch``
        fails for any reason.
        """
        try:
            arch, cfg = get_model_arch(model_path, trust_remote_code=trust_remote_code)
            cfg_dict = cfg.to_dict()
        except Exception:
            cfg_dict = {}
            raw_config = os.path.join(model_path, 'config.json')
            if os.path.exists(raw_config):
                try:
                    with open(raw_config, encoding='utf-8') as f:
                        cfg_dict = json.load(f)
                except Exception:
                    cfg_dict = {}
            arch = (cfg_dict.get('architectures') or [None])[0]

        is_v32 = arch == 'DeepseekV32ForCausalLM' or cfg_dict.get('model_type') == 'deepseek_v32'
        return 'deepseek-v32' if is_v32 else None
100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -172,7 +172,7 @@ class ChatCompletionRequest(BaseModel): presence_penalty: float | None = 0.0 frequency_penalty: float | None = 0.0 user: str | None = None - reasoning_effort: Literal['low', 'medium', 'high'] | None = None + reasoning_effort: Literal['low', 'medium', 'high', 'max'] | None = None response_format: ResponseFormat | None = Field(default=None, examples=[None]) # additional argument of lmdeploy do_preprocess: bool | None = True diff --git a/lmdeploy/serve/parsers/reasoning_parser/__init__.py b/lmdeploy/serve/parsers/reasoning_parser/__init__.py index 1f29020d7f..7b08f76525 100644 --- a/lmdeploy/serve/parsers/reasoning_parser/__init__.py +++ b/lmdeploy/serve/parsers/reasoning_parser/__init__.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from .deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser +from .deepseek_v4_reasoning_parser import DeepSeekV4ReasoningParser +from .deepseek_v32_reasoning_parser import DeepSeekV32ReasoningParser from .reasoning_parser import LEGACY_REASONING_PARSER_NAMES, ReasoningParser, ReasoningParserManager __all__ = [ @@ -7,4 +9,6 @@ 'ReasoningParser', 'ReasoningParserManager', 'DeepSeekV3ReasoningParser', + 'DeepSeekV32ReasoningParser', + 'DeepSeekV4ReasoningParser', ] diff --git a/lmdeploy/serve/parsers/reasoning_parser/deepseek_v32_reasoning_parser.py b/lmdeploy/serve/parsers/reasoning_parser/deepseek_v32_reasoning_parser.py new file mode 100644 index 0000000000..b00f74ed34 --- /dev/null +++ b/lmdeploy/serve/parsers/reasoning_parser/deepseek_v32_reasoning_parser.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
@ReasoningParserManager.register_module(['deepseek-v32', 'deepseek-v3.2'])
class DeepSeekV32ReasoningParser(ReasoningParser):
    """Reasoning parser for DeepSeek-V3.2 thinking mode."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Remember the thinking switches exactly as they were passed in
        # (None when absent).
        self.thinking = kwargs.get('thinking')
        self.enable_thinking = kwargs.get('enable_thinking')

    def starts_in_reasoning_mode(self) -> bool:
        """Return True only when one of the switches is the literal ``True``;
        other truthy values do not count."""
        switches = (self.thinking, self.enable_thinking)
        return any(switch is True for switch in switches)


@ReasoningParserManager.register_module('deepseek-v4')
class DeepSeekV4ReasoningParser(ReasoningParser):
    """Reasoning parser for DeepSeek-V4 thinking mode."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Remember the thinking switches exactly as they were passed in
        # (None when absent).
        self.thinking = kwargs.get('thinking')
        self.enable_thinking = kwargs.get('enable_thinking')

    def starts_in_reasoning_mode(self) -> bool:
        """Return True only when one of the switches is the literal ``True``;
        other truthy values do not count."""
        switches = (self.thinking, self.enable_thinking)
        return any(switch is True for switch in switches)
return chat_template_kwargs def __init__(self, request: ChatCompletionRequest): @@ -281,6 +283,8 @@ def __init__(self, request: ChatCompletionRequest): tcls = type(self).tool_parser_cls self._kwargs = type(self).chat_template_kwargs_from_request(request) self.enable_thinking: bool | None = self._kwargs.get('enable_thinking', None) + if self._kwargs.get('thinking') is True: + self.enable_thinking = True self.reasoning_parser: ReasoningParser | None = rcls(**self._kwargs) if rcls else None self.tool_parser: ToolParser | None = tcls() if tcls else None if self.tool_parser is not None: @@ -672,8 +676,11 @@ def parse_complete( close_idx = n tool_payload = text[open_idx + len(open_tag):].strip() parsed_call = self.tool_parser.parse_tool_call_complete(tool_payload) if self.tool_parser else None - if parsed_call is not None: - tool_calls.append(parsed_call) + if parsed_call: + if isinstance(parsed_call, list): + tool_calls.extend(parsed_call) + else: + tool_calls.append(parsed_call) pos = close_idx + len(close_tag) if close_tag else n else: # Tool call parsing failed — fall back to plain text. diff --git a/lmdeploy/serve/parsers/tool_parser/__init__.py b/lmdeploy/serve/parsers/tool_parser/__init__.py index f5c547ac1e..f9c5ceaba7 100644 --- a/lmdeploy/serve/parsers/tool_parser/__init__.py +++ b/lmdeploy/serve/parsers/tool_parser/__init__.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from .deepseek_v4_tool_parser import DeepSeekV4ToolParser +from .deepseek_v32_tool_parser import DeepSeekV32ToolParser from .glm47_tool_parser import Glm47ToolParser from .internlm2_tool_parser import Internlm2ToolParser from .interns2preview_tool_parser import InternS2PreviewToolParser @@ -13,6 +15,8 @@ 'ToolParser', 'ToolParserManager', 'XmlToolParser', + 'DeepSeekV32ToolParser', + 'DeepSeekV4ToolParser', 'Glm47ToolParser', 'Internlm2ToolParser', 'Llama3JsonToolParser', diff --git a/lmdeploy/serve/parsers/tool_parser/deepseek_v32_tool_parser.py b/lmdeploy/serve/parsers/tool_parser/deepseek_v32_tool_parser.py new file mode 100644 index 0000000000..62f7f03a1d --- /dev/null +++ b/lmdeploy/serve/parsers/tool_parser/deepseek_v32_tool_parser.py @@ -0,0 +1,107 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from __future__ import annotations + +import shortuuid + +from lmdeploy.deepseek_v32_encoding import dsml_token, parse_tool_calls +from lmdeploy.serve.openai.protocol import ( + DeltaFunctionCall, + DeltaToolCall, + FunctionCall, + ToolCall, +) + +from .tool_parser import ToolParser, ToolParserManager + +TOOL_CALLS_BLOCK_NAME = 'function_calls' + + +@ToolParserManager.register_module(['deepseek-v32', 'deepseek-v3.2']) +class DeepSeekV32ToolParser(ToolParser): + """Tool parser for DeepSeek-V3.2 DSML function-call blocks.""" + + dsml_token = dsml_token + tool_calls_block_name = TOOL_CALLS_BLOCK_NAME + parse_tool_calls_func = staticmethod(parse_tool_calls) + + @classmethod + def get_tool_open_tag(cls) -> str | None: + return f'\n\n<{cls.dsml_token}{cls.tool_calls_block_name}>' + + @classmethod + def get_tool_close_tag(cls) -> str | None: + return f'' + + @classmethod + def get_tool_payload_format(cls) -> str: + return 'dsml' + + def decode_tool_incremental(self, added_text: str, *, final: bool) -> list[DeltaToolCall]: + self._tool_payload += added_text + if not final: + return [] + + tool_calls = self.parse_tool_call_complete(self._tool_payload) + if not 
tool_calls: + return [] + + out: list[DeltaToolCall] = [] + for offset, tool_call in enumerate(tool_calls): + index = self._active_tool_index + offset + out.append( + DeltaToolCall( + id=f'chatcmpl-tool-{shortuuid.random()}', + index=index, + type='function', + function=DeltaFunctionCall(name=tool_call.function.name), + )) + out.append( + DeltaToolCall( + id=None, + index=index, + type=None, + function=DeltaFunctionCall(arguments=tool_call.function.arguments), + )) + + self._active_tool_index += len(tool_calls) - 1 + return out + + def parse_tool_call_complete(self, payload: str) -> list[ToolCall] | None: + payload = payload.strip() + if not payload: + return None + + wrapped = f'{self.get_tool_open_tag()}\n{payload}\n{self.get_tool_close_tag()}' + start = len(self.get_tool_open_tag()) - 1 + try: + _, stop_token, raw_tool_calls = self.parse_tool_calls_func(start, wrapped) + except Exception: + return None + if stop_token != self.get_tool_close_tag() or not raw_tool_calls: + return None + + return [ + ToolCall(function=FunctionCall(name=tool_call['name'], arguments=tool_call['arguments'])) + for tool_call in raw_tool_calls + ] + + def validate_complete(self, text: str) -> bool: + open_tag = self.get_tool_open_tag() + close_tag = self.get_tool_close_tag() + + pos = 0 + while True: + open_idx = text.find(open_tag, pos) + close_idx = text.find(close_tag, pos) + if open_idx < 0: + return close_idx < 0 + + payload_start = open_idx + len(open_tag) + if close_idx < payload_start: + return False + if self.parse_tool_call_complete(text[payload_start:close_idx]) is None: + return False + + pos = close_idx + len(close_tag) + if pos >= len(text): + return True diff --git a/lmdeploy/serve/parsers/tool_parser/deepseek_v4_tool_parser.py b/lmdeploy/serve/parsers/tool_parser/deepseek_v4_tool_parser.py new file mode 100644 index 0000000000..9ee633b741 --- /dev/null +++ b/lmdeploy/serve/parsers/tool_parser/deepseek_v4_tool_parser.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. 
@ToolParserManager.register_module(['deepseek-v4'])
class DeepSeekV4ToolParser(DeepSeekV32ToolParser):
    """Tool parser for DeepSeek-V4 DSML tool-call blocks.

    All parsing logic is inherited from ``DeepSeekV32ToolParser``; only the
    class-level settings below are rebound to the names imported from
    ``lmdeploy.deepseek_v4_encoding``.
    """

    # The right-hand sides resolve to the module-level imports of the same
    # name, because the class namespace does not define them yet.
    parse_tool_calls_func = staticmethod(parse_tool_calls)
    tool_calls_block_name = tool_calls_block_name
    dsml_token = dsml_token
mm_processor_kwargs: dict[str, Any] | None = None, @@ -346,7 +346,7 @@ async def _get_text_prompt_input(self, sequence_start: bool, adapter_name: str, tools: list[object] | None = None, - reasoning_effort: Literal['low', 'medium', 'high'] | None = None, + reasoning_effort: Literal['low', 'medium', 'high', 'max'] | None = None, chat_template_kwargs: dict | None = None, **kwargs): """Process text-only prompt and return prompt string and input_ids.""" diff --git a/tests/test_lmdeploy/test_deepseek_v32_encoding.py b/tests/test_lmdeploy/test_deepseek_v32_encoding.py new file mode 100644 index 0000000000..64e6723ea7 --- /dev/null +++ b/tests/test_lmdeploy/test_deepseek_v32_encoding.py @@ -0,0 +1,254 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json + +from lmdeploy.deepseek_v32_encoding import ( + bos_token, + encode_messages, + eos_token, + parse_message_from_completion_text, +) +from lmdeploy.model import MODELS, DeepseekV32ChatTemplate, get_chat_template +from lmdeploy.serve.openai.protocol import ChatCompletionRequest +from lmdeploy.serve.parsers import ResponseParserManager +from lmdeploy.serve.parsers.reasoning_parser import ReasoningParserManager +from lmdeploy.serve.parsers.tool_parser import ToolParserManager + +WEATHER_TOOL = { + 'type': 'function', + 'function': { + 'name': 'get_weather', + 'description': 'Get weather for a city.', + 'parameters': { + 'type': 'object', + 'properties': { + 'city': { + 'type': 'string' + } + }, + 'required': ['city'], + }, + }, +} + + +def test_deepseek_v32_minimal_chat_and_thinking_modes(): + messages = [{'role': 'user', 'content': 'Hello'}] + + assert encode_messages(messages, thinking_mode='chat') == ( + f'{bos_token}<|User|>Hello<|Assistant|>' + ) + assert encode_messages(messages, thinking_mode='thinking') == ( + f'{bos_token}<|User|>Hello<|Assistant|>' + ) + + +def test_deepseek_v32_uses_function_call_block(): + messages = [ + { + 'role': 'system', + 'content': 'You may call tools.', + 'tools': 
[WEATHER_TOOL], + }, + { + 'role': 'user', + 'content': 'Weather in Paris?', + }, + { + 'role': 'assistant', + 'reasoning_content': 'I should call the weather tool.', + 'tool_calls': [{ + 'type': 'function', + 'function': { + 'name': 'get_weather', + 'arguments': '{"city": "Paris"}', + }, + }], + }, + ] + + prompt = encode_messages(messages, thinking_mode='thinking', drop_thinking=False) + + assert '## Tools' in prompt + assert '"name": "get_weather"' in prompt + assert '<|DSML|function_calls>' in prompt + assert '' in prompt + assert '<|DSML|tool_calls>' not in prompt + assert '<|DSML|parameter name="city" string="true">Paris' in prompt + + +def test_deepseek_v32_tool_results_reopen_thinking(): + messages = [ + { + 'role': 'user', + 'content': 'Weather in Paris?', + }, + { + 'role': 'assistant', + 'tool_calls': [{ + 'id': 'call_1', + 'type': 'function', + 'function': { + 'name': 'get_weather', + 'arguments': '{"city": "Paris"}', + }, + }], + }, + { + 'role': 'tool', + 'tool_call_id': 'call_1', + 'content': 'Sunny', + }, + ] + + prompt = encode_messages(messages, thinking_mode='thinking', drop_thinking=False) + + assert '\nSunny\n\n\n' in prompt + + +def test_deepseek_v32_parse_completion_text(): + completion = ( + 'I should call a tool.\n\n' + '<|DSML|function_calls>\n' + '<|DSML|invoke name="get_weather">\n' + '<|DSML|parameter name="city" string="true">Paris\n' + '\n' + '' + f'{eos_token}' + ) + + parsed = parse_message_from_completion_text(completion, thinking_mode='thinking') + + assert parsed['reasoning_content'] == 'I should call a tool.' 
+ assert parsed['content'] == '' + assert parsed['tool_calls'][0]['function']['name'] == 'get_weather' + assert json.loads(parsed['tool_calls'][0]['function']['arguments']) == {'city': 'Paris'} + + +def test_deepseek_v32_chat_template_uses_vllm_thinking_switches(): + model = MODELS.get('deepseek-v32')() + assert model.messages2prompt([{'role': 'user', 'content': 'Hello'}]) == ( + f'{bos_token}<|User|>Hello<|Assistant|>' + ) + assert model.messages2prompt([{'role': 'user', 'content': 'Hello'}], thinking=True) == ( + f'{bos_token}<|User|>Hello<|Assistant|>' + ) + assert model.messages2prompt([{'role': 'user', 'content': 'Hello'}], enable_thinking=True) == ( + f'{bos_token}<|User|>Hello<|Assistant|>' + ) + + +def test_deepseek_v32_chat_template_normalizes_lmdeploy_tools_and_dict_arguments(): + model = MODELS.get('deepseek-v32')() + prompt = model.messages2prompt( + [ + {'role': 'user', 'content': 'List files'}, + { + 'role': 'assistant', + 'tool_calls': [ + { + 'type': 'function', + 'function': { + 'name': 'str_replace_editor', + 'arguments': { + 'command': 'view', + 'path': '/testbed', + }, + }, + } + ], + }, + ], + tools=[ + { + 'name': 'str_replace_editor', + 'description': 'Edit files', + 'parameters': { + 'type': 'object', + 'properties': { + 'command': { + 'type': 'string' + }, + 'path': { + 'type': 'string' + }, + }, + 'required': ['command', 'path'], + }, + } + ], + enable_thinking=True, + drop_thinking=False, + ) + assert '## Tools' in prompt + assert '<|DSML|function_calls>' in prompt + assert '<|DSML|tool_calls>' not in prompt + assert '"name": "str_replace_editor"' in prompt + assert '<|DSML|parameter name="command" string="true">view' in prompt + assert '<|DSML|parameter name="path" string="true">/testbed' in prompt + assert 'parameter name="arguments"' not in prompt + + +def test_deepseek_v32_chat_template_match_minimal_config(tmp_path): + (tmp_path / 'config.json').write_text( + json.dumps({ + 'model_type': 'deepseek_v32', + 'architectures': 
['DeepseekV32ForCausalLM'], + }), + encoding='utf-8', + ) + assert DeepseekV32ChatTemplate.match(str(tmp_path)) == 'deepseek-v32' + assert isinstance(get_chat_template(str(tmp_path)), DeepseekV32ChatTemplate) + + +def _make_response_parser(thinking=True): + cls = ResponseParserManager.get('default') + cls.reasoning_parser_cls = ReasoningParserManager.get('deepseek-v32') + cls.tool_parser_cls = ToolParserManager.get('deepseek-v32') + request = ChatCompletionRequest( + model='deepseek-ai/DeepSeek-V3.2', + messages=[], + stream=True, + chat_template_kwargs={'thinking': thinking}, + ) + return cls(request=request) + + +def test_deepseek_v32_response_parser_complete_dsml_function_calls(): + completion = ( + 'I should call a tool.\n\n' + '<|DSML|function_calls>\n' + '<|DSML|invoke name="get_weather">\n' + '<|DSML|parameter name="city" string="true">Paris\n' + '\n' + '' + ) + + parser = _make_response_parser(thinking=True) + content, tool_calls, reasoning_content = parser.parse_complete(completion) + assert content is None + assert reasoning_content == 'I should call a tool.' 
+ assert tool_calls is not None + assert len(tool_calls) == 1 + assert tool_calls[0].function.name == 'get_weather' + assert json.loads(tool_calls[0].function.arguments) == {'city': 'Paris'} + assert parser.validate_complete(completion) + + +def test_deepseek_v32_response_parser_streaming_dsml_function_calls(): + text = ( + 'need data\n\n' + '<|DSML|function_calls>\n' + '<|DSML|invoke name="search">\n' + '<|DSML|parameter name="query" string="true">DeepSeek V3.2\n' + '\n' + '' + ) + parser = _make_response_parser(thinking=True) + + deltas = parser.stream_chunk(delta_text=text, delta_token_ids=[]) + reasoning = ''.join(delta.reasoning_content or '' for delta, _ in deltas) + tool_deltas = [tool_call for delta, _ in deltas for tool_call in (delta.tool_calls or [])] + + assert reasoning == 'need data' + assert tool_deltas[0].function.name == 'search' + assert json.loads(tool_deltas[1].function.arguments) == {'query': 'DeepSeek V3.2'} diff --git a/tests/test_lmdeploy/test_deepseek_v4_encoding.py b/tests/test_lmdeploy/test_deepseek_v4_encoding.py new file mode 100644 index 0000000000..5acdad967f --- /dev/null +++ b/tests/test_lmdeploy/test_deepseek_v4_encoding.py @@ -0,0 +1,274 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import json

from lmdeploy.deepseek_v4_encoding import (
    REASONING_EFFORT_MAX,
    bos_token,
    encode_messages,
    eos_token,
    parse_message_from_completion_text,
)
from lmdeploy.model import MODELS, DeepseekV4ChatTemplate, get_chat_template
from lmdeploy.serve.openai.protocol import ChatCompletionRequest
from lmdeploy.serve.parsers import ResponseParserManager
from lmdeploy.serve.parsers.reasoning_parser import ReasoningParserManager
from lmdeploy.serve.parsers.tool_parser import ToolParserManager

# JSON schema of the single ``city`` argument accepted by the weather tool.
_CITY_SCHEMA = {
    'type': 'object',
    'properties': {
        'city': {
            'type': 'string'
        }
    },
    'required': ['city'],
}

# OpenAI-style tool definition shared by the encoding tests.
WEATHER_TOOL = {
    'type': 'function',
    'function': {
        'name': 'get_weather',
        'description': 'Get weather for a city.',
        'parameters': _CITY_SCHEMA,
    },
}

# Model output used by both the plain-text parser test and the complete
# response-parser test: reasoning text followed by one DSML tool call.
# NOTE(review): no closing markers follow the parameter line in this copy of
# the fixture -- confirm against the upstream test data.
_WEATHER_CALL_COMPLETION = (
    'I should call a tool.\n\n'
    '<|DSML|tool_calls>\n'
    '<|DSML|invoke name="get_weather">\n'
    '<|DSML|parameter name="city" string="true">Paris\n'
    '\n'
)


def _paris_weather_function():
    """Return a fresh OpenAI-style ``function`` payload asking for Paris weather."""
    return {
        'name': 'get_weather',
        'arguments': '{"city": "Paris"}',
    }


def test_deepseek_v4_minimal_chat_and_thinking_modes():
    """A single user turn renders to the expected prompt in both modes."""
    conversation = [{'role': 'user', 'content': 'Hello'}]

    # NOTE(review): the chat and thinking expectations are the same literal in
    # this copy -- confirm whether thinking mode should differ.
    expected_chat = f'{bos_token}<|User|>Hello<|Assistant|>'
    assert encode_messages(conversation, thinking_mode='chat') == expected_chat

    expected_thinking = f'{bos_token}<|User|>Hello<|Assistant|>'
    assert encode_messages(conversation, thinking_mode='thinking') == expected_thinking


def test_deepseek_v4_uses_v4_tool_call_block():
    """Tool calls are rendered in a ``tool_calls`` block, not ``function_calls``."""
    system_turn = {
        'role': 'system',
        'content': 'You may call tools.',
        'tools': [WEATHER_TOOL],
    }
    user_turn = {
        'role': 'user',
        'content': 'Weather in Paris?',
    }
    assistant_turn = {
        'role': 'assistant',
        'reasoning_content': 'I should call the weather tool.',
        'tool_calls': [{
            'type': 'function',
            'function': _paris_weather_function(),
        }],
    }

    rendered = encode_messages(
        [system_turn, user_turn, assistant_turn],
        thinking_mode='thinking',
        drop_thinking=False,
    )

    assert '## Tools' in rendered
    assert '"name": "get_weather"' in rendered
    assert '<|DSML|tool_calls>' in rendered
    # NOTE(review): an empty string is contained in every string, so this
    # assertion can never fail -- the intended marker looks to be missing.
    assert '' in rendered
    assert '<|DSML|function_calls>' not in rendered
    assert '<|DSML|parameter name="city" string="true">Paris' in rendered


def test_deepseek_v4_merges_tool_results_into_user_blocks():
    """A tool result shows up in the prompt after the tool-call block."""
    conversation = [
        {
            'role': 'user',
            'content': 'Weather in Paris?',
        },
        {
            'role': 'assistant',
            'tool_calls': [{
                'id': 'call_1',
                'type': 'function',
                'function': _paris_weather_function(),
            }],
        },
        {
            'role': 'tool',
            'tool_call_id': 'call_1',
            'content': 'Sunny',
        },
    ]

    rendered = encode_messages(conversation, thinking_mode='chat')

    assert 'Sunny' in rendered
    call_position = rendered.index('<|DSML|tool_calls>')
    result_position = rendered.index('Sunny')
    assert call_position < result_position


def test_deepseek_v4_task_and_latest_reminder_rendering():
    """``latest_reminder`` turns and the user ``task`` field get their own markers."""
    reminder_turn = {
        'role': 'latest_reminder',
        'content': 'Be terse.',
    }
    task_turn = {
        'role': 'user',
        'content': 'Classify this page.',
        'task': 'domain',
    }

    rendered = encode_messages([reminder_turn, task_turn], thinking_mode='chat')

    assert '<|latest_reminder|>Be terse.' in rendered
    assert '<|User|>Classify this page.<|domain|>' in rendered
    # With a task attached, no assistant marker is expected in the prompt.
    assert '<|Assistant|>' not in rendered


def test_deepseek_v4_parse_completion_text():
    """Completion text is split into reasoning, content and tool calls."""
    completion = f'{_WEATHER_CALL_COMPLETION}{eos_token}'

    message = parse_message_from_completion_text(completion, thinking_mode='thinking')

    assert message['reasoning_content'] == 'I should call a tool.'
    assert message['content'] == ''
    first_function = message['tool_calls'][0]['function']
    assert first_function['name'] == 'get_weather'
    # Arguments are carried as a JSON string; compare the decoded value.
    assert json.loads(message['tool_calls'][0]['function']['arguments']) == {'city': 'Paris'}


def test_deepseek_v4_chat_template_normalizes_lmdeploy_tools():
    """The chat template accepts a tool passed without the OpenAI wrapper."""
    template = MODELS.get('deepseek-v4')()
    conversation = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': "What's the weather in Beijing?"},
    ]
    # Bare tool description: name/description/parameters at the top level.
    bare_tool = {
        'name': 'get_weather',
        'description': 'Get weather for a location.',
        'parameters': {
            'type': 'object',
            'properties': {
                'location': {
                    'type': 'string'
                }
            },
            'required': ['location'],
        },
    }

    rendered = template.messages2prompt(
        conversation,
        tools=[bare_tool],
        enable_thinking=True,
        reasoning_effort='max',
    )

    assert rendered.startswith(bos_token + REASONING_EFFORT_MAX)
    assert '## Tools' in rendered
    assert '"name": "get_weather"' in rendered
    assert rendered.endswith('<|Assistant|>')


def test_deepseek_v4_reasoning_effort_does_not_enable_thinking():
    """``reasoning_effort`` alone must not add the effort prefix to the prompt."""
    template = MODELS.get('deepseek-v4')()

    rendered = template.messages2prompt(
        [{'role': 'user', 'content': 'Hello'}],
        reasoning_effort='max',
    )

    assert REASONING_EFFORT_MAX not in rendered
    assert rendered == f'{bos_token}<|User|>Hello<|Assistant|>'


def test_deepseek_v4_chat_template_match_minimal_config(tmp_path):
    """A directory holding only a minimal ``config.json`` selects the V4 template."""
    minimal_config = {
        'model_type': 'deepseek_v4',
        'architectures': ['DeepseekV4ForCausalLM'],
    }
    config_file = tmp_path / 'config.json'
    config_file.write_text(json.dumps(minimal_config), encoding='utf-8')

    model_dir = str(tmp_path)
    assert DeepseekV4ChatTemplate.match(model_dir) == 'deepseek-v4'
    assert isinstance(get_chat_template(model_dir), DeepseekV4ChatTemplate)


def _make_response_parser(thinking=True):
    """Build a default response parser wired to the DeepSeek V4 sub-parsers.

    The reasoning and tool parser classes are assigned on the class returned
    by ``ResponseParserManager.get('default')`` itself, so the assignment
    remains in effect after this helper returns.

    Args:
        thinking: Value forwarded as ``chat_template_kwargs['thinking']`` of
            the streaming request the parser is constructed with.
    """
    parser_cls = ResponseParserManager.get('default')
    parser_cls.reasoning_parser_cls = ReasoningParserManager.get('deepseek-v4')
    parser_cls.tool_parser_cls = ToolParserManager.get('deepseek-v4')

    template_kwargs = {'thinking': thinking}
    streaming_request = ChatCompletionRequest(
        model='deepseek-ai/DeepSeek-V4',
        messages=[],
        stream=True,
        chat_template_kwargs=template_kwargs,
    )
    return parser_cls(request=streaming_request)


def test_deepseek_v4_response_parser_complete_dsml_tool_call():
    """Non-streaming parse yields reasoning plus one tool call and no content."""
    completion = _WEATHER_CALL_COMPLETION

    parser = _make_response_parser(thinking=True)
    content, tool_calls, reasoning_content = parser.parse_complete(completion)

    assert content is None
    assert reasoning_content == 'I should call a tool.'
    assert tool_calls is not None
    assert len(tool_calls) == 1
    only_call = tool_calls[0]
    assert only_call.function.name == 'get_weather'
    assert json.loads(tool_calls[0].function.arguments) == {'city': 'Paris'}
    assert parser.validate_complete(completion)


def test_deepseek_v4_response_parser_streaming_dsml_tool_call():
    """Streaming the whole output as one chunk emits reasoning, then tool deltas."""
    streamed_text = (
        'need a tool\n\n'
        '<|DSML|tool_calls>\n'
        '<|DSML|invoke name="search">\n'
        '<|DSML|parameter name="query" string="true">DeepSeek V4\n'
        '\n'
    )
    parser = _make_response_parser(thinking=True)

    deltas = parser.stream_chunk(delta_text=streamed_text, delta_token_ids=[])

    # First pass over the deltas: stitch the reasoning text together.
    reasoning_pieces = []
    for delta, _ in deltas:
        reasoning_pieces.append(delta.reasoning_content or '')
    reasoning = ''.join(reasoning_pieces)

    # Second pass: flatten every tool-call delta in emission order.
    tool_deltas = []
    for delta, _ in deltas:
        for tool_call in (delta.tool_calls or []):
            tool_deltas.append(tool_call)

    assert reasoning == 'need a tool'
    # The first tool delta carries the name, the second the JSON arguments.
    assert tool_deltas[0].function.name == 'search'
    assert json.loads(tool_deltas[1].function.arguments) == {'query': 'DeepSeek V4'}


def test_deepseek_v4_response_parser_reasoning_effort_does_not_enable_thinking():
    """With only ``reasoning_effort`` set, streamed text is plain content."""
    parser_cls = ResponseParserManager.get('default')
    parser_cls.reasoning_parser_cls = ReasoningParserManager.get('deepseek-v4')
    # No tool parser for this scenario; set on the class, like the helper does.
    parser_cls.tool_parser_cls = None
    effort_only_request = ChatCompletionRequest(
        model='deepseek-ai/DeepSeek-V4',
        messages=[],
        stream=True,
        reasoning_effort='max',
    )
    parser = parser_cls(request=effort_only_request)

    deltas = parser.stream_chunk(delta_text='hello', delta_token_ids=[])

    assert len(deltas) == 1
    delta, tool_emitted = deltas[0]
    assert tool_emitted is False
    assert delta.content == 'hello'
    assert delta.reasoning_content is None