diff --git a/lmdeploy/deepseek_v32_encoding.py b/lmdeploy/deepseek_v32_encoding.py
new file mode 100644
index 0000000000..afeac6e34b
--- /dev/null
+++ b/lmdeploy/deepseek_v32_encoding.py
@@ -0,0 +1,394 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapted from deepseek-ai/DeepSeek-V3.2 encoding/encoding_dsv32.py.
+import copy
+import json
+import re
+from typing import Any
+
+# Tool-calling section of the system prompt. render_tools() fills the {dsml_token},
+# {thinking_start_token}, {thinking_end_token} and {tool_schemas} placeholders via str.format().
+# NOTE(review): the closing markers below ("{dsml_token}parameter>", "{dsml_token}invoke>",
+# "{dsml_token}function_calls>") have no "/" in front of the token, while parse_tool_calls()
+# searches for f'/{dsml_token}parameter' -- confirm these literals against the upstream file.
+TOOLS_SYSTEM_TEMPLATE = (
+ '## Tools\n\n'
+ "You have access to a set of tools you can use to answer the user's question.\n"
+ 'You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of '
+ 'your reply to the user:\n'
+ '<{dsml_token}function_calls>\n'
+ '<{dsml_token}invoke name="$FUNCTION_NAME">\n'
+ '<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE{dsml_token}parameter>\n'
+ '...\n'
+ '{dsml_token}invoke>\n'
+ '<{dsml_token}invoke name="$FUNCTION_NAME2">\n'
+ '...\n'
+ '{dsml_token}invoke>\n'
+ '{dsml_token}function_calls>\n\n'
+ 'String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects '
+ 'should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" '
+ 'for other types (numbers, booleans, arrays, objects).\n\n'
+ 'If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking '
+ 'block. Here is an example:\n\n'
+ '<{dsml_token}function_calls>\n'
+ '...\n'
+ '{dsml_token}function_calls>\n\n'
+ '\n'
+ '...\n'
+ '\n\n'
+ '{thinking_start_token}...thinking about results{thinking_end_token}\n\n'
+ 'Here are the functions available in JSONSchema format:\n'
+ '\n'
+ '{tool_schemas}\n'
+ '\n'
+)
+
+# Special tokens and per-role format templates shared by render_message() and the parser below.
+bos_token: str = '<|begin▁of▁sentence|>'
+eos_token: str = '<|end▁of▁sentence|>'
+# NOTE(review): both thinking tokens are empty strings here. With '' as a token, the closing
+# `sp_token not in ...` assertion in parse_message_from_completion_text() can never hold, because
+# '' is a substring of every string -- presumably these should be the model's think tags; verify.
+thinking_start_token: str = ''
+thinking_end_token: str = ''
+# Marker embedded in every DSML tag name (function_calls / invoke / parameter).
+dsml_token: str = '|DSML|'
+system_msg_template: str = '{content}'
+user_msg_template: str = '<|User|>{content}<|Assistant|>'
+assistant_msg_template: str = '{reasoning}{content}{tool_calls}<|end▁of▁sentence|>'
+thinking_template = '{reasoning_content}'
+
+# Appended to system/developer messages that carry a `response_format`.
+response_format_template: str = (
+ '## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}'
+)
+# One invoke element; {arguments} is the output of encode_arguments_to_dsml().
+tool_call_template: str = (
+ "<{dsml_token}invoke name=\"{name}\">\n{arguments}\n{dsml_token}invoke>"
+)
+# Wrapper around all invoke elements of a single assistant message.
+tool_calls_template = (
+ '<{dsml_token}function_calls>\n{tool_calls}\n{dsml_token}function_calls>'
+)
+
+# Format applied to each tool message's content in render_message().
+tool_output_template: str = (
+ '\n{content}'
+)
+
+def to_json(value: Any) -> str:
+    """Serialize ``value`` to a JSON string, keeping non-ASCII characters when possible.
+
+    The first attempt uses ``ensure_ascii=False``; if that raises, the value is
+    serialized again with ``ensure_ascii=True``.
+    """
+    try:
+        encoded = json.dumps(value, ensure_ascii=False)
+    except Exception:
+        encoded = json.dumps(value, ensure_ascii=True)
+    return encoded
+
+def tools_from_openai_format(tools):
+    """Return the ``'function'`` entry of every OpenAI-format tool, in order."""
+    functions = []
+    for entry in tools:
+        functions.append(entry['function'])
+    return functions
+
+def tool_calls_from_openai_format(tool_calls):
+    """Flatten OpenAI-format tool calls into ``{'name', 'arguments'}`` dicts."""
+    flattened = []
+    for call in tool_calls:
+        function = call['function']
+        flattened.append({'name': function['name'], 'arguments': function['arguments']})
+    return flattened
+
+def tool_calls_to_openai_format(tool_calls):
+    """Wrap internal ``{'name', 'arguments'}`` dicts in the OpenAI tool-call envelope."""
+    wrapped = []
+    for call in tool_calls:
+        function = {'name': call['name'], 'arguments': call['arguments']}
+        wrapped.append({'type': 'function', 'function': function})
+    return wrapped
+
+def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
+    """Render a tool call's arguments as newline-separated DSML parameter elements.
+
+    Args:
+        tool_call: Internal tool-call dict. ``tool_call['arguments']`` is decoded with
+            ``json.loads`` when it is a string and used as-is otherwise.
+
+    Returns:
+        One parameter element per argument, joined with newlines. String values are
+        written verbatim with ``string="true"``; all other values are serialized by
+        ``to_json`` and marked ``string="false"``.
+
+    Raises:
+        ValueError: If the (decoded) arguments are not a dict. ``json.loads`` errors
+            for malformed JSON strings propagate unchanged.
+    """
+    p_dsml_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}{dsml_token}parameter>"""
+
+    decoded = tool_call['arguments']
+    if isinstance(decoded, str):
+        decoded = json.loads(decoded)
+    if not isinstance(decoded, dict):
+        raise ValueError('Assistant tool call function.arguments must be a JSON object.')
+
+    rendered = []
+    for name, value in decoded.items():
+        if isinstance(value, str):
+            flag, text = 'true', value
+        else:
+            flag, text = 'false', to_json(value)
+        rendered.append(p_dsml_template.format(dsml_token=dsml_token, key=name, is_str=flag, value=text))
+
+    return '\n'.join(rendered)
+
+
+def decode_dsml_to_arguments(tool_name: str, tool_args: dict[str, tuple[str, str]]) -> dict[str, str]:
+    """Assemble parsed DSML parameters into an internal tool-call dict.
+
+    Args:
+        tool_name: Name of the invoked tool.
+        tool_args: Maps parameter name to ``(raw_value, string_flag)``.
+
+    Returns:
+        ``{'name': tool_name, 'arguments': <JSON object text>}``. Values flagged ``'true'``
+        are JSON-quoted with ``to_json``; every other value is spliced into the object text
+        exactly as received.
+    """
+    members = []
+    for key, (raw_value, is_str) in tool_args.items():
+        rendered = to_json(raw_value) if is_str == 'true' else raw_value
+        members.append(f'{to_json(key)}: {rendered}')
+
+    return {'name': tool_name, 'arguments': '{' + ', '.join(members) + '}'}
+
+def render_tools(tools: list[dict[str, str | dict[str, Any]]]) -> str:
+    """Fill ``TOOLS_SYSTEM_TEMPLATE`` with the given tool schemas.
+
+    Each schema is serialized with ``to_json`` and the results are placed one per line
+    in the ``{tool_schemas}`` slot.
+    """
+    schema_lines = '\n'.join(to_json(schema) for schema in tools)
+    return TOOLS_SYSTEM_TEMPLATE.format(
+        dsml_token=dsml_token,
+        thinking_start_token=thinking_start_token,
+        thinking_end_token=thinking_end_token,
+        tool_schemas=schema_lines,
+    )
+
+def find_last_user_index(messages: list[dict[str, Any]]) -> int:
+    """Return the index of the last 'user' or 'developer' message, or -1 if there is none."""
+    for position in reversed(range(len(messages))):
+        if messages[position].get('role') in ('user', 'developer'):
+            return position
+    return -1
+
+def render_message(index: int, messages: list[dict[str, Any]], thinking_mode: str) -> str:
+ """Render the message at ``index`` into its prompt fragment.
+
+ Args:
+ index: Position of the message to render inside ``messages``.
+ messages: The whole conversation; tool messages consult earlier entries.
+ thinking_mode: Either 'chat' or 'thinking'.
+
+ Returns:
+ The encoded text for this single message.
+
+ Raises:
+ AssertionError: On an out-of-range index, an invalid ``thinking_mode``, or a message
+ that fails one of the per-role checks below.
+ NotImplementedError: If the role is not system/developer/user/tool/assistant.
+ """
+ assert 0 <= index < len(messages)
+ assert thinking_mode in ['chat', 'thinking'], f'Invalid thinking_mode `{thinking_mode}`'
+
+ prompt = ''
+ msg = messages[index]
+ last_user_idx = find_last_user_index(messages)
+
+ role = msg.get('role')
+ content = msg.get('content')
+ tools = msg.get('tools')
+ response_format = msg.get('response_format')
+ tool_calls = msg.get('tool_calls')
+ reasoning_content = msg.get('reasoning_content')
+
+ # Normalize OpenAI-format tools / tool calls to the internal representation.
+ if tools:
+ tools = tools_from_openai_format(tools)
+ if tool_calls:
+ tool_calls = tool_calls_from_openai_format(tool_calls)
+
+ if role == 'system':
+ prompt += system_msg_template.format(content=content or '')
+ if tools:
+ prompt += '\n\n' + render_tools(tools)
+
+ if response_format:
+ prompt += '\n\n' + response_format_template.format(schema=to_json(response_format))
+
+ elif role == 'developer':
+ assert content, f'Invalid message for role `{role}`: {msg}'
+ # Tools and response format come first; the developer text is appended last.
+ content_developer = ''
+ if tools:
+ content_developer += '\n\n' + render_tools(tools)
+
+ if response_format:
+ content_developer += '\n\n' + response_format_template.format(schema=to_json(response_format))
+
+ content_developer += f"\n\n# The user's message is: {content}"
+
+ prompt += user_msg_template.format(content=content_developer)
+ # Only the last user/developer turn in thinking mode gets the start token; others get the end token.
+ if index == last_user_idx and thinking_mode == 'thinking':
+ prompt += thinking_start_token
+ else:
+ prompt += thinking_end_token
+
+ elif role == 'user':
+ prompt += user_msg_template.format(content=content)
+
+ # Same thinking-token rule as the developer branch.
+ if index == last_user_idx and thinking_mode == 'thinking':
+ prompt += thinking_start_token
+ else:
+ prompt += thinking_end_token
+
+ elif role == 'tool':
+ # Walk back over consecutive tool messages to the message that precedes them.
+ # NOTE(review): if the walk reaches -1 (or index == 0), messages[-1] reads the LAST
+ # message through negative indexing rather than failing -- confirm this is intended.
+ prev_assistant_idx = index - 1
+ assistant_msg = messages[prev_assistant_idx]
+ while prev_assistant_idx >= 0 and assistant_msg.get('role') == 'tool':
+ prev_assistant_idx -= 1
+ assistant_msg = messages[prev_assistant_idx]
+
+ assert (
+ index == 0 or prev_assistant_idx >= 0 and assistant_msg.get('role') == 'assistant'
+ ), f'Invalid messages at {index}:\n{assistant_msg}'
+
+ # 1-based position of this tool output among the outputs following that message.
+ tool_call_order = index - prev_assistant_idx
+ assistant_tool_calls = assistant_msg.get('tool_calls')
+ assert (
+ assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order
+ ), 'No tool calls but found tool output'
+
+ # The first output of a group is preceded by a blank-line separator.
+ if tool_call_order == 1:
+ prompt += '\n\n'
+
+ prompt += tool_output_template.format(content=content)
+
+ # After the output matching the last tool call, close the group and add the thinking token.
+ if tool_call_order == len(assistant_tool_calls):
+ prompt += '\n'
+
+ if index >= last_user_idx and thinking_mode == 'thinking':
+ prompt += '\n\n' + thinking_start_token
+ else:
+ prompt += '\n\n' + thinking_end_token
+
+ elif role == 'assistant':
+ # Assigned but not read again in this branch.
+ prev_assistant_idx = index
+ thinking_part = ''
+
+ tool_calls_content = ''
+ if tool_calls:
+ # Render each call as an invoke element, then wrap them all in one function_calls block.
+ tool_calls = [
+ tool_call_template.format(
+ dsml_token=dsml_token,
+ name=tool_call.get('name'),
+ arguments=encode_arguments_to_dsml(tool_call)
+ )
+ for tool_call in tool_calls
+ ]
+ tool_calls_content += '\n\n' + tool_calls_template.format(
+ dsml_token=dsml_token,
+ tool_calls='\n'.join(tool_calls)
+ )
+
+ summary_content = content or ''
+
+ # Reasoning is emitted only for assistant turns after the last user/developer turn.
+ if thinking_mode == 'thinking' and index > last_user_idx:
+ assert reasoning_content or tool_calls, (
+ f'ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` '
+ 'after last user message')
+ thinking_part = thinking_template.format(reasoning_content=reasoning_content or '') + thinking_end_token
+
+ prompt += assistant_msg_template.format(
+ reasoning=thinking_part,
+ content=summary_content,
+ tool_calls=tool_calls_content,
+ )
+ else:
+ raise NotImplementedError(f'Unknown role: {role}')
+
+ return prompt
+
+def drop_thinking_messages(messages: list[dict[str, Any]], last_user_idx: int | None = None) -> list[dict[str, Any]]:
+    """Strip reasoning from assistant turns that precede the last user turn.
+
+    Args:
+        messages: Conversation to filter; the input dicts are not modified.
+        last_user_idx: Index of the last user/developer message. Computed with
+            ``find_last_user_index`` when omitted.
+
+    Returns:
+        A new list. Messages at or after ``last_user_idx`` and all user/system/tool
+        messages are kept as-is. Earlier assistant messages are shallow-copied without
+        ``reasoning_content``. Any other earlier message is left out.
+    """
+    if last_user_idx is None:
+        last_user_idx = find_last_user_index(messages)
+
+    kept: list[dict[str, Any]] = []
+    for position, message in enumerate(messages):
+        role = message.get('role')
+        if role in ['user', 'system', 'tool'] or position >= last_user_idx:
+            kept.append(message)
+        elif role == 'assistant':
+            stripped = copy.copy(message)
+            stripped.pop('reasoning_content', None)
+            kept.append(stripped)
+
+    return kept
+
+def encode_messages(messages: list[dict[str, Any]],
+                    thinking_mode: str,
+                    context: list[dict[str, Any]] | None = None,
+                    drop_thinking: bool = True,
+                    add_default_bos_token: bool = True) -> str:
+    """Encode ``messages`` into a prompt string.
+
+    Args:
+        messages: Messages to render.
+        thinking_mode: Either 'chat' or 'thinking' (validated by ``render_message``).
+        context: Optional preceding messages. They are consulted while rendering but
+            are not rendered themselves.
+        drop_thinking: In thinking mode, filter the conversation through
+            ``drop_thinking_messages`` before rendering.
+        add_default_bos_token: Prepend ``bos_token`` when there is no context.
+
+    Returns:
+        The concatenated rendering of every message in ``messages`` that survives
+        the optional filtering.
+    """
+    context = context if context else []
+    full_messages = context + messages
+
+    prompt = bos_token if add_default_bos_token and len(context) == 0 else ''
+
+    first_new_idx = len(context)
+    if thinking_mode == 'thinking' and drop_thinking:
+        last_user_idx = find_last_user_index(full_messages)
+        # drop_thinking_messages() leaves some messages out entirely (e.g. 'developer'
+        # turns before the last user turn), so the filtered list can be shorter than the
+        # input. Count the surviving context messages with the same cut-off so rendering
+        # starts at the first surviving new message instead of a stale offset.
+        first_new_idx = len(drop_thinking_messages(context, last_user_idx=last_user_idx))
+        full_messages = drop_thinking_messages(full_messages, last_user_idx=last_user_idx)
+
+    for idx in range(first_new_idx, len(full_messages)):
+        prompt += render_message(idx, full_messages, thinking_mode=thinking_mode)
+
+    return prompt
+
+def _read_until_stop(index: int, text: str, stop: list[str]) -> tuple[int, str, str | None]:
+    """Scan ``text`` from ``index`` up to the earliest occurrence of any stop string.
+
+    Returns:
+        ``(new_index, content, matched_stop)``. On a match, ``new_index`` points just past
+        the stop string and ``content`` is the text before it. When several stop strings
+        start at the same position, the one listed first wins. Without a match (an empty
+        stop string counts as no match) the rest of ``text`` is returned with ``None``.
+    """
+    end_of_text = len(text)
+    candidates = [(text.find(marker, index), marker) for marker in stop]
+    hits = [hit for hit in candidates if hit[0] != -1 and hit[0] < end_of_text]
+
+    if hits:
+        # min() returns the first minimal element, so earlier entries of `stop` win ties.
+        position, marker = min(hits, key=lambda hit: hit[0])
+        if marker:
+            return position + len(marker), text[index:position], marker
+
+    return end_of_text, text[index:], None
+
+def parse_tool_calls(index: int, text: str):
+    """Parse consecutive invoke elements starting at ``index``.
+
+    Args:
+        index: Offset in ``text`` where scanning starts.
+        text: Completion text being parsed.
+
+    Returns:
+        ``(index, stop_token, tool_calls)``: the offset after the last consumed marker,
+        the last marker matched, and the tool calls built by ``decode_dsml_to_arguments``.
+
+    Raises:
+        AssertionError: When the text does not follow the expected layout.
+    """
+    invoke_open = f'<{dsml_token}invoke'
+    invoke_close = f'{dsml_token}invoke'
+    param_open = f'<{dsml_token}parameter'
+    param_close = f'/{dsml_token}parameter'
+    tool_calls_end_token = f'{dsml_token}function_calls>'
+
+    tool_calls: list[dict[str, Any]] = []
+    stop_token = None
+
+    while index < len(text):
+        index, leading, stop_token = _read_until_stop(index, text, [invoke_open, tool_calls_end_token])
+        assert leading == '>\n', 'Tool call format error'
+
+        if stop_token == tool_calls_end_token:
+            break
+
+        assert stop_token is not None, 'Missing special token'
+
+        index, tool_name_content, stop_token = _read_until_stop(index, text, [param_open, invoke_close])
+
+        name_matches = re.findall(r'^\s*name="(.*?)">\n$', tool_name_content, flags=re.DOTALL)
+        assert len(name_matches) == 1, 'Tool name format error'
+        tool_name = name_matches[0]
+
+        tool_args: dict[str, tuple[str, str]] = {}
+        while stop_token == param_open:
+            index, param_content, stop_token = _read_until_stop(index, text, [param_close])
+
+            param_matches = re.findall(
+                r'^ name="(.*?)" string="(true|false)">(.*?)<$', param_content, flags=re.DOTALL)
+            assert len(param_matches) == 1, 'Parameter format error'
+            param_name, string_flag, param_value = param_matches[0]
+
+            assert param_name not in tool_args, 'Duplicate parameter name'
+            tool_args[param_name] = (param_value, string_flag)
+
+            index, trailing, stop_token = _read_until_stop(index, text, [param_open, invoke_close])
+            assert trailing == '>\n', 'Parameter format error'
+
+        tool_calls.append(decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args))
+
+    return index, stop_token, tool_calls
+
+# NOTE: This function parses only correctly formatted strings and will not attempt to correct
+# malformed output that may be generated by the model.
+def parse_message_from_completion_text(text: str, thinking_mode: str):
+    """Split a completion into reasoning, summary content and tool calls.
+
+    Args:
+        text: Raw completion text.
+        thinking_mode: When equal to 'thinking', the text must start with a reasoning
+            section terminated by ``thinking_end_token``.
+
+    Returns:
+        An assistant message dict with ``role``, ``content``, ``reasoning_content`` and
+        ``tool_calls`` (OpenAI format).
+
+    Raises:
+        AssertionError: When the text does not follow the expected layout or a special
+            token appears inside the content or the reasoning.
+    """
+    tool_calls_start_token = f'\n\n<{dsml_token}function_calls'
+
+    reasoning_content = ''
+    tool_calls = []
+    index = 0
+    stop_token = None
+
+    if thinking_mode == 'thinking':
+        index, reasoning_content, stop_token = _read_until_stop(
+            index, text, [thinking_end_token, tool_calls_start_token])
+        assert stop_token == thinking_end_token, 'Invalid thinking format'
+
+    index, summary_content, stop_token = _read_until_stop(index, text, [eos_token, tool_calls_start_token])
+    if stop_token != tool_calls_start_token:
+        assert stop_token == eos_token, 'Invalid summary format'
+    else:
+        index, stop_token, tool_calls = parse_tool_calls(index, text)
+
+        # Nothing but the end-of-sentence token may follow the tool-call block.
+        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
+        assert not tool_ends_text, 'Unexpected content after tool calls'
+
+    assert len(text) == index and stop_token in [eos_token, None], 'Unexpected content at end'
+
+    for sp_token in (bos_token, eos_token, thinking_start_token, thinking_end_token, dsml_token):
+        assert (
+            sp_token not in summary_content and sp_token not in reasoning_content
+        ), 'Unexpected special token in content'
+
+    return {
+        'role': 'assistant',
+        'content': summary_content,
+        'reasoning_content': reasoning_content,
+        'tool_calls': tool_calls_to_openai_format(tool_calls),
+    }
diff --git a/lmdeploy/deepseek_v4_encoding.py b/lmdeploy/deepseek_v4_encoding.py
new file mode 100644
index 0000000000..c24c4c5584
--- /dev/null
+++ b/lmdeploy/deepseek_v4_encoding.py
@@ -0,0 +1,743 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapted from deepseek-ai/DeepSeek-V4-Pro encoding/encoding_dsv4.py.
+"""DeepSeek-V4 Encoding.
+
+A self-contained implementation for encoding/decoding DeepSeek-V4 chat messages with tool calling, thinking mode, and
+quick instruction task support.
+"""
+
+import copy
+import json
+import re
+from typing import Any
+
+# ============================================================
+# Special Tokens
+# ============================================================
+
+bos_token: str = '<|begin▁of▁sentence|>'
+eos_token: str = '<|end▁of▁sentence|>'
+# NOTE(review): both thinking tokens are empty strings here, yet TOOLS_TEMPLATE interpolates
+# them into prose ("triggered by {thinking_start_token}") -- presumably they should be the
+# model's think tags; verify against the upstream encoding file.
+thinking_start_token: str = ''
+thinking_end_token: str = ''
+# Marker embedded in every DSML tag name (tool_calls / invoke / parameter).
+dsml_token: str = '|DSML|'
+
+# Role markers written by render_message().
+USER_SP_TOKEN = '<|User|>'
+ASSISTANT_SP_TOKEN = '<|Assistant|>'
+LATEST_REMINDER_SP_TOKEN = '<|latest_reminder|>'
+
+# Task special tokens for internal classification tasks
+# (keyed by the value of a message's 'task' field).
+DS_TASK_SP_TOKENS = {
+ 'action': '<|action|>',
+ 'query': '<|query|>',
+ 'authority': '<|authority|>',
+ 'domain': '<|domain|>',
+ 'title': '<|title|>',
+ 'read_url': '<|read_url|>',
+}
+# Task names accepted by render_message().
+VALID_TASKS = set(DS_TASK_SP_TOKENS.keys())
+
+# ============================================================
+# Templates
+# ============================================================
+
+# Per-role format templates; the role marker tokens are added separately by render_message().
+system_msg_template: str = '{content}'
+user_msg_template: str = '{content}'
+latest_reminder_msg_template: str = '{content}'
+# Assistant turns end with eos_token unless the message sets 'wo_eos'.
+assistant_msg_template: str = '{reasoning}{content}{tool_calls}' + eos_token
+assistant_msg_wo_eos_template: str = '{reasoning}{content}{tool_calls}'
+thinking_template: str = '{reasoning_content}'
+
+# Appended to system/developer messages that carry a `response_format`.
+response_format_template: str = (
+ '## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}'
+)
+# One invoke element; {arguments} is the output of encode_arguments_to_dsml().
+tool_call_template: str = (
+ "<{dsml_token}invoke name=\"{name}\">\n{arguments}\n{dsml_token}invoke>"
+)
+# Wrapper around all invoke elements of one assistant message; {tc_block_name} is
+# filled with tool_calls_block_name.
+tool_calls_template = (
+ '<{dsml_token}{tc_block_name}>\n{tool_calls}\n{dsml_token}{tc_block_name}>'
+)
+tool_calls_block_name: str = 'tool_calls'
+
+# Format applied to each tool_result block rendered inside a user message.
+tool_output_template: str = (
+ '{content}'
+)
+
+# Prefix that render_message() puts before the message at index 0 when thinking mode is on
+# and reasoning_effort == 'max'.
+REASONING_EFFORT_MAX = (
+ 'Reasoning Effort: Absolute maximum with no shortcuts permitted.\n'
+ 'You MUST be very thorough in your thinking and comprehensively decompose the problem to resolve the '
+ 'root cause, rigorously stress-testing your logic against all potential paths, edge cases, and adversarial '
+ 'scenarios.\n'
+ 'Explicitly write out your entire deliberation process, documenting every intermediate step, considered '
+ 'alternative, and rejected hypothesis to ensure absolutely no assumption is left unchecked.\n\n'
+)
+
+# Tool-calling section of the prompt. render_tools() fills the {dsml_token},
+# {thinking_start_token}, {thinking_end_token} and {tool_schemas} placeholders via str.format().
+# NOTE(review): the closing markers below ("{dsml_token}parameter>", "{dsml_token}invoke>",
+# "{dsml_token}tool_calls>") have no "/" in front of the token -- confirm these literals
+# against the upstream encoding file.
+TOOLS_TEMPLATE = (
+ '## Tools\n\n'
+ "You have access to a set of tools to help answer the user's question. You can invoke tools by writing a "
+ '"<{dsml_token}tool_calls>" block like the following:\n\n'
+ '<{dsml_token}tool_calls>\n'
+ '<{dsml_token}invoke name="$TOOL_NAME">\n'
+ '<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE{dsml_token}parameter>\n'
+ '...\n'
+ '{dsml_token}invoke>\n'
+ '<{dsml_token}invoke name="$TOOL_NAME2">\n'
+ '...\n'
+ '{dsml_token}invoke>\n'
+ '{dsml_token}tool_calls>\n\n'
+ 'String parameters should be specified as is and set `string="true"`. For all other types (numbers, booleans, '
+ 'arrays, objects), pass the value in JSON format and set `string="false"`.\n\n'
+ 'If thinking_mode is enabled (triggered by {thinking_start_token}), you MUST output your complete reasoning '
+ 'inside {thinking_start_token}...{thinking_end_token} BEFORE any tool calls or final response.\n\n'
+ 'Otherwise, output directly after {thinking_end_token} with tool calls or final response.\n\n'
+ '### Available Tool Schemas\n\n'
+ '{tool_schemas}\n\n'
+ 'You MUST strictly follow the above defined tool name and parameter schemas to invoke tool calls.\n'
+)
+
+# ============================================================
+# Utility Functions
+# ============================================================
+
+def to_json(value: Any) -> str:
+    """Serialize ``value`` to a JSON string, keeping non-ASCII characters when possible.
+
+    The first attempt uses ``ensure_ascii=False``; if that raises, the value is
+    serialized again with ``ensure_ascii=True``.
+    """
+    try:
+        encoded = json.dumps(value, ensure_ascii=False)
+    except Exception:
+        encoded = json.dumps(value, ensure_ascii=True)
+    return encoded
+
+
+def tools_from_openai_format(tools):
+    """Return the ``'function'`` entry of every OpenAI-format tool, in order."""
+    functions = []
+    for entry in tools:
+        functions.append(entry['function'])
+    return functions
+
+
+def tool_calls_from_openai_format(tool_calls):
+    """Flatten OpenAI-format tool calls into ``{'name', 'arguments'}`` dicts."""
+    flattened = []
+    for call in tool_calls:
+        function = call['function']
+        flattened.append({'name': function['name'], 'arguments': function['arguments']})
+    return flattened
+
+
+def tool_calls_to_openai_format(tool_calls):
+    """Wrap internal ``{'name', 'arguments'}`` dicts in the OpenAI tool-call envelope."""
+    wrapped = []
+    for call in tool_calls:
+        function = {'name': call['name'], 'arguments': call['arguments']}
+        wrapped.append({'type': 'function', 'function': function})
+    return wrapped
+
+
+def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
+    """Encode tool call arguments into DSML parameter format.
+
+    Args:
+        tool_call: Dict with "name" and "arguments" keys. "arguments" is normally a JSON
+            string holding an object; an already-decoded dict is accepted as well.
+
+    Returns:
+        DSML-formatted parameter string, one parameter element per line. String values
+        are written verbatim with ``string="true"``; all other values are serialized by
+        ``to_json`` and marked ``string="false"``.
+
+    Notes:
+        Best-effort: when "arguments" cannot be decoded into an object (malformed JSON,
+        or valid JSON that is not an object, such as a list or a number), the raw value
+        is emitted as a single parameter named "arguments" instead of raising.
+    """
+    p_dsml_template = '<{dsml_token}parameter name="{key}" string="{is_str}">{value}{dsml_token}parameter>'
+
+    raw_arguments = tool_call['arguments']
+    if isinstance(raw_arguments, dict):
+        arguments = raw_arguments
+    else:
+        try:
+            arguments = json.loads(raw_arguments)
+        except (TypeError, ValueError, RecursionError):
+            arguments = None
+        # Only a JSON object has named parameters; anything else falls back to the raw value.
+        if not isinstance(arguments, dict):
+            arguments = {'arguments': raw_arguments}
+
+    P_dsml_strs = [
+        p_dsml_template.format(
+            dsml_token=dsml_token,
+            key=k,
+            is_str='true' if isinstance(v, str) else 'false',
+            value=v if isinstance(v, str) else to_json(v),
+        )
+        for k, v in arguments.items()
+    ]
+
+    return '\n'.join(P_dsml_strs)
+
+
+def decode_dsml_to_arguments(tool_name: str, tool_args: dict[str, tuple[str, str]]) -> dict[str, str]:
+    """Assemble parsed DSML parameters into an internal tool-call dict.
+
+    Args:
+        tool_name: Name of the tool.
+        tool_args: Dict mapping param_name -> (value, is_string_flag).
+
+    Returns:
+        Dict with "name" and "arguments" keys, where "arguments" is JSON object text.
+        Values flagged 'true' are JSON-quoted with ``to_json``; every other value is
+        spliced into the object text exactly as received.
+    """
+    members = []
+    for key, (raw_value, is_str) in tool_args.items():
+        rendered = to_json(raw_value) if is_str == 'true' else raw_value
+        members.append(f'{to_json(key)}: {rendered}')
+
+    return {'name': tool_name, 'arguments': '{' + ', '.join(members) + '}'}
+
+
+def render_tools(tools: list[dict[str, str | dict[str, Any]]]) -> str:
+    """Fill ``TOOLS_TEMPLATE`` with the given tool schemas.
+
+    Args:
+        tools: List of tool schema dicts.
+
+    Returns:
+        The tools section, with each schema serialized by ``to_json`` on its own line.
+    """
+    schema_lines = '\n'.join(to_json(schema) for schema in tools)
+    return TOOLS_TEMPLATE.format(
+        dsml_token=dsml_token,
+        thinking_start_token=thinking_start_token,
+        thinking_end_token=thinking_end_token,
+        tool_schemas=schema_lines,
+    )
+
+
+def find_last_user_index(messages: list[dict[str, Any]]) -> int:
+    """Return the index of the last 'user' or 'developer' message, or -1 if there is none."""
+    for position in reversed(range(len(messages))):
+        if messages[position].get('role') in ('user', 'developer'):
+            return position
+    return -1
+
+
+# ============================================================
+# Message Rendering
+# ============================================================
+
+def render_message(index: int,
+ messages: list[dict[str, Any]],
+ thinking_mode: str,
+ drop_thinking: bool = True,
+ reasoning_effort: str | None = None) -> str:
+ """Render a single message at the given index into its encoded string form.
+
+ This is the core function that converts each message in the conversation
+ into the DeepSeek-V4 format.
+
+ Args:
+ index: Index of the message to render.
+ messages: Full list of messages in the conversation.
+ thinking_mode: Either "chat" or "thinking".
+ drop_thinking: Whether to drop reasoning content from earlier turns.
+ reasoning_effort: Optional reasoning effort level ("max", "high", or None).
+
+ Returns:
+ Encoded string for this message.
+
+ Raises:
+ AssertionError: On an out-of-range index, an invalid thinking_mode or
+ reasoning_effort, an empty developer message, or an unknown task name.
+ NotImplementedError: For the "tool" role (tool messages must be merged into user
+ messages beforehand) and for any role not handled below.
+ """
+ assert 0 <= index < len(messages)
+ assert thinking_mode in ['chat', 'thinking'], f'Invalid thinking_mode `{thinking_mode}`'
+
+ prompt = ''
+ msg = messages[index]
+ last_user_idx = find_last_user_index(messages)
+
+ role = msg.get('role')
+ content = msg.get('content')
+ tools = msg.get('tools')
+ response_format = msg.get('response_format')
+ tool_calls = msg.get('tool_calls')
+ reasoning_content = msg.get('reasoning_content')
+ # When truthy, the assistant message is rendered without the trailing eos_token.
+ wo_eos = msg.get('wo_eos', False)
+
+ # Normalize OpenAI-format tools / tool calls to the internal representation.
+ if tools:
+ tools = tools_from_openai_format(tools)
+ if tool_calls:
+ tool_calls = tool_calls_from_openai_format(tool_calls)
+
+ # Reasoning effort prefix (only at index 0 in thinking mode with max effort)
+ assert reasoning_effort in ['max', None, 'high'], f'Invalid reasoning effort: {reasoning_effort}'
+ if index == 0 and thinking_mode == 'thinking' and reasoning_effort == 'max':
+ prompt += REASONING_EFFORT_MAX
+
+ if role == 'system':
+ prompt += system_msg_template.format(content=content or '')
+ if tools:
+ prompt += '\n\n' + render_tools(tools)
+ if response_format:
+ prompt += '\n\n' + response_format_template.format(schema=to_json(response_format))
+
+ elif role == 'developer':
+ assert content, f'Invalid message for role `{role}`: {msg}'
+
+ # Developer text is rendered under the user marker, followed by tools / response format.
+ content_developer = USER_SP_TOKEN
+ content_developer += content
+
+ if tools:
+ content_developer += '\n\n' + render_tools(tools)
+ if response_format:
+ content_developer += '\n\n' + response_format_template.format(schema=to_json(response_format))
+
+ prompt += user_msg_template.format(content=content_developer)
+
+ elif role == 'user':
+ prompt += USER_SP_TOKEN
+
+ # Handle content blocks (tool results mixed with text)
+ content_blocks = msg.get('content_blocks')
+ if content_blocks:
+ parts = []
+ for block in content_blocks:
+ block_type = block.get('type')
+ if block_type == 'text':
+ parts.append(block.get('text', ''))
+ elif block_type == 'tool_result':
+ tool_content = block.get('content', '')
+ # A list-valued tool result is flattened: text entries are joined with blank
+ # lines, other entry types become an "[Unsupported ...]" placeholder.
+ if isinstance(tool_content, list):
+ text_parts = []
+ for b in tool_content:
+ if b.get('type') == 'text':
+ text_parts.append(b.get('text', ''))
+ else:
+ text_parts.append(f"[Unsupported {b.get('type')}]")
+ tool_content = '\n\n'.join(text_parts)
+ parts.append(tool_output_template.format(content=tool_content))
+ else:
+ parts.append(f'[Unsupported {block_type}]')
+ prompt += '\n\n'.join(parts)
+ else:
+ # No content blocks: fall back to the plain 'content' field.
+ prompt += content or ''
+
+ elif role == 'latest_reminder':
+ prompt += LATEST_REMINDER_SP_TOKEN + latest_reminder_msg_template.format(content=content)
+
+ elif role == 'tool':
+ raise NotImplementedError(
+ 'deepseek_v4 merges tool messages into user; please preprocess with merge_tool_messages()')
+
+ elif role == 'assistant':
+ thinking_part = ''
+ tc_content = ''
+
+ if tool_calls:
+ # Render each call as an invoke element, then wrap them all in one tool_calls block.
+ tc_list = [
+ tool_call_template.format(
+ dsml_token=dsml_token,
+ name=tc.get('name'),
+ arguments=encode_arguments_to_dsml(tc)
+ )
+ for tc in tool_calls
+ ]
+ tc_content += '\n\n' + tool_calls_template.format(
+ dsml_token=dsml_token,
+ tool_calls='\n'.join(tc_list),
+ tc_block_name=tool_calls_block_name,
+ )
+
+ summary_content = content or ''
+ rc = reasoning_content or ''
+
+ # Check if previous message has a task - if so, this is a task output (no thinking)
+ prev_has_task = index - 1 >= 0 and messages[index - 1].get('task') is not None
+
+ # Reasoning is rendered when thinking is not being dropped, or for turns after the
+ # last user/developer message.
+ if thinking_mode == 'thinking' and not prev_has_task:
+ if not drop_thinking or index > last_user_idx:
+ thinking_part = thinking_template.format(reasoning_content=rc) + thinking_end_token
+ else:
+ thinking_part = ''
+
+ if wo_eos:
+ prompt += assistant_msg_wo_eos_template.format(
+ reasoning=thinking_part,
+ content=summary_content,
+ tool_calls=tc_content,
+ )
+ else:
+ prompt += assistant_msg_template.format(
+ reasoning=thinking_part,
+ content=summary_content,
+ tool_calls=tc_content,
+ )
+ else:
+ raise NotImplementedError(f'Unknown role: {role}')
+
+ # Append transition tokens based on what follows
+ # (skipped when a next message exists and is neither 'assistant' nor 'latest_reminder').
+ if index + 1 < len(messages) and messages[index + 1].get('role') not in ['assistant', 'latest_reminder']:
+ return prompt
+
+ task = messages[index].get('task')
+ if task is not None:
+ # Task special token for internal classification tasks
+ assert task in VALID_TASKS, f"Invalid task: '{task}'. Valid tasks are: {list(VALID_TASKS)}"
+ task_sp_token = DS_TASK_SP_TOKENS[task]
+
+ if task != 'action':
+ # Non-action tasks: append task sp token directly after the message
+ prompt += task_sp_token
+ else:
+ # Action task: append Assistant + thinking token + action sp token
+ prompt += ASSISTANT_SP_TOKEN
+ prompt += thinking_end_token if thinking_mode != 'thinking' else thinking_start_token
+ prompt += task_sp_token
+
+ elif messages[index].get('role') in ['user', 'developer']:
+ # Normal generation: append Assistant + thinking token
+ prompt += ASSISTANT_SP_TOKEN
+ # The start token is used in thinking mode when thinking is kept, or when this is at or
+ # after the last user/developer turn; every other case gets the end token.
+ if not drop_thinking and thinking_mode == 'thinking':
+ prompt += thinking_start_token
+ elif drop_thinking and thinking_mode == 'thinking' and index >= last_user_idx:
+ prompt += thinking_start_token
+ else:
+ prompt += thinking_end_token
+
+ return prompt
+
+
+# ============================================================
+# Preprocessing
+# ============================================================
+
+def merge_tool_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Fold "tool" messages into user messages that carry ``content_blocks``.
+
+    Every input message is deep-copied, so the caller's list and dicts stay untouched.
+
+    - A "tool" message becomes a ``tool_result`` block. It is appended to the previous
+      output message when that one is a user message with ``content_blocks``; otherwise
+      it starts a new user message.
+    - A "user" message becomes a ``text`` block. It is appended to the previous output
+      message under the same condition, provided that message has no ``task``; otherwise
+      a new user message is created that keeps ``content`` plus the ``task``, ``wo_eos``
+      and ``mask`` fields when present.
+    - All other messages are passed through as copies.
+
+    Args:
+        messages: List of message dicts in OpenAI format.
+
+    Returns:
+        Processed message list with tool messages merged into user messages.
+    """
+    result: list[dict[str, Any]] = []
+
+    for original in messages:
+        message = copy.deepcopy(original)
+        role = message.get('role')
+
+        if role not in ('tool', 'user'):
+            result.append(message)
+            continue
+
+        tail = result[-1] if result else None
+        tail_has_blocks = tail is not None and tail.get('role') == 'user' and 'content_blocks' in tail
+
+        if role == 'tool':
+            block = {
+                'type': 'tool_result',
+                'tool_use_id': message.get('tool_call_id', ''),
+                'content': message.get('content', ''),
+            }
+            if tail_has_blocks:
+                tail['content_blocks'].append(block)
+            else:
+                result.append({'role': 'user', 'content_blocks': [block]})
+            continue
+
+        block = {'type': 'text', 'text': message.get('content', '')}
+        if tail_has_blocks and tail.get('task') is None:
+            tail['content_blocks'].append(block)
+            continue
+
+        fresh = {
+            'role': 'user',
+            'content': message.get('content', ''),
+            'content_blocks': [block],
+        }
+        # Carry over the extra fields that later stages read from user messages.
+        fresh.update({key: message[key] for key in ('task', 'wo_eos', 'mask') if key in message})
+        result.append(fresh)
+
+    return result
+
+
+def sort_tool_results_by_call_order(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Reorder ``tool_result`` blocks to follow the most recent assistant tool calls.
+
+    User messages are updated in place (their ``content_blocks`` list is replaced) and
+    the same ``messages`` list is returned. Non-tool blocks keep their positions; only
+    the slots holding ``tool_result`` blocks are refilled in sorted order. Results whose
+    ``tool_use_id`` is unknown sort with rank 0.
+
+    Args:
+        messages: Preprocessed message list (after merge_tool_messages).
+
+    Returns:
+        Message list with sorted tool result blocks.
+    """
+    call_rank: dict[str, int] = {}
+
+    for message in messages:
+        role = message.get('role')
+
+        if role == 'assistant' and message.get('tool_calls'):
+            # A new batch of tool calls replaces the ranking from any earlier batch.
+            call_rank = {}
+            for rank, call in enumerate(message['tool_calls']):
+                call_id = call.get('id') or call.get('function', {}).get('id', '')
+                if call_id:
+                    call_rank[call_id] = rank
+            continue
+
+        if role != 'user' or not message.get('content_blocks'):
+            continue
+
+        blocks = message['content_blocks']
+        results = [block for block in blocks if block.get('type') == 'tool_result']
+        if len(results) <= 1 or not call_rank:
+            continue
+
+        ordered = iter(sorted(results, key=lambda block: call_rank.get(block.get('tool_use_id', ''), 0)))
+        message['content_blocks'] = [
+            next(ordered) if block.get('type') == 'tool_result' else block
+            for block in blocks
+        ]
+
+    return messages
+
+
+# ============================================================
+# Main Encoding Function
+# ============================================================
+
+def encode_messages(
+    messages: list[dict[str, Any]],
+    thinking_mode: str,
+    context: list[dict[str, Any]] | None = None,
+    drop_thinking: bool = True,
+    add_default_bos_token: bool = True,
+    reasoning_effort: str | None = None,
+) -> str:
+    """Encode a list of messages into the DeepSeek-V4 prompt format.
+
+    This is the main entry point for encoding conversations. It handles:
+    - BOS token insertion
+    - Thinking mode with optional reasoning content dropping
+    - Tool message merging into user messages
+    - Multi-turn conversation context
+
+    Args:
+        messages: List of message dicts to encode.
+        thinking_mode: Either "chat" or "thinking".
+        context: Optional preceding context messages (already encoded prefix). They are
+            consulted while rendering but are not rendered themselves.
+        drop_thinking: If True, drop reasoning_content from earlier assistant turns
+            (only keep reasoning for messages after the last user message). Ignored
+            when any message defines tools.
+        add_default_bos_token: Whether to prepend BOS token at conversation start.
+        reasoning_effort: Optional reasoning effort level ("max", "high", or None).
+
+    Returns:
+        The encoded prompt string.
+    """
+    context = context if context else []
+
+    # Preprocess: merge tool messages and sort tool results
+    messages = merge_tool_messages(messages)
+    messages = sort_tool_results_by_call_order(context + messages)[len(context):]
+    if context:
+        context = merge_tool_messages(context)
+        context = sort_tool_results_by_call_order(context)
+
+    full_messages = context + messages
+
+    prompt = bos_token if add_default_bos_token and len(context) == 0 else ''
+
+    # Resolve drop_thinking: if any message has tools defined, don't drop thinking
+    effective_drop_thinking = drop_thinking
+    if any(m.get('tools') for m in full_messages):
+        effective_drop_thinking = False
+
+    context_len = len(context)
+    if thinking_mode == 'thinking' and effective_drop_thinking:
+        # Dropping can remove whole messages, so the number of surviving context messages
+        # must be counted with the SAME cut-off (last user index of the full conversation)
+        # that is applied to full_messages. Filtering `context` on its own would use the
+        # context's last user index and can keep messages that the full pass removes.
+        last_user_idx = find_last_user_index(full_messages)
+        if last_user_idx >= len(context):
+            # The cut-off lies in the new messages: a sentinel user turn placed right
+            # after the context reproduces "every context message is before the cut-off".
+            context_len = len(_drop_thinking_messages(context + [{'role': 'user'}])) - 1
+        else:
+            # The cut-off lies inside the context (or there is none), so the context's
+            # own last user index is the same one.
+            context_len = len(_drop_thinking_messages(context))
+        full_messages = _drop_thinking_messages(full_messages)
+
+    for idx in range(context_len, len(full_messages)):
+        prompt += render_message(
+            idx,
+            full_messages,
+            thinking_mode=thinking_mode,
+            drop_thinking=effective_drop_thinking,
+            reasoning_effort=reasoning_effort,
+        )
+
+    return prompt
+
+
+def _drop_thinking_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Filter a conversation so that earlier turns no longer carry reasoning.
+
+    The cut-off is the index returned by ``find_last_user_index``.
+
+    Behavior:
+        - Roles "user", "system", "tool", "latest_reminder" and
+          "direct_search_results" are always kept unchanged.
+        - Every message at or after the cut-off is kept unchanged.
+        - Assistant messages before the cut-off are kept as a shallow copy
+          without ``reasoning_content`` (the input dict is not mutated).
+        - Any other role before the cut-off (e.g. developer) is dropped.
+    """
+    always_kept = {'user', 'system', 'tool', 'latest_reminder', 'direct_search_results'}
+    cutoff = find_last_user_index(messages)
+
+    kept = []
+    for position, message in enumerate(messages):
+        role = message.get('role')
+        if role in always_kept or position >= cutoff:
+            kept.append(message)
+            continue
+        if role != 'assistant':
+            # developer and other roles before the cut-off are dropped
+            continue
+        stripped = copy.copy(message)
+        stripped.pop('reasoning_content', None)
+        kept.append(stripped)
+
+    return kept
+
+
+# ============================================================
+# Parsing (Decoding model output)
+# ============================================================
+
+def _read_until_stop(index: int, text: str, stop: list[str]) -> tuple[int, str, str | None]:
+    """Scan ``text`` from ``index`` up to the earliest occurrence of a stop string.
+
+    When several stop strings start at the same position, the one listed
+    first in ``stop`` wins.
+
+    Returns:
+        Tuple of (new_index, content_before_stop, matched_stop_string_or_None).
+        ``new_index`` points just past the matched stop string, or to the end
+        of ``text`` when nothing matched.
+    """
+    end = len(text)
+    best_pos, best_stop = end, None
+
+    for candidate in stop:
+        found = text.find(candidate, index)
+        # str.find yields -1 on a miss; only a strictly earlier hit replaces the best.
+        if 0 <= found < best_pos:
+            best_pos, best_stop = found, candidate
+
+    if not best_stop:
+        return end, text[index:], None
+    return best_pos + len(best_stop), text[index:best_pos], best_stop
+
+
+def parse_tool_calls(index: int, text: str) -> tuple[int, str | None, list[dict[str, str]]]:
+    """Parse a DSML tool-call block out of ``text``, starting at ``index``.
+
+    Args:
+        index: Starting position in text.
+        text: The full text to parse.
+
+    Returns:
+        Tuple of (new_index, last_stop_token, list_of_tool_call_dicts).
+        Each entry is whatever ``decode_dsml_to_arguments`` builds from the
+        tool name and its parameters.
+
+    Raises:
+        ValueError: If the text between delimiters does not have the expected
+            shape, a delimiter is missing, or a parameter name repeats.
+    """
+    # Delimiters are built once; they only depend on module-level constants.
+    invoke_open = f'<{dsml_token}invoke'
+    invoke_close = f'{dsml_token}invoke'
+    param_open = f'<{dsml_token}parameter'
+    param_close = f'/{dsml_token}parameter'
+    block_close = f'{dsml_token}{tool_calls_block_name}>'
+
+    parsed_calls: list[dict[str, Any]] = []
+    stop_token = None
+
+    while index < len(text):
+        index, separator, stop_token = _read_until_stop(index, text, [invoke_open, block_close])
+        # Only the tail of the previous tag may sit before the next delimiter.
+        if separator != '>\n':
+            raise ValueError(f"Tool call format error: expected '>\\n' but got '{separator}'")
+
+        if stop_token == block_close:
+            break
+
+        if stop_token is None:
+            raise ValueError('Missing special token in tool calls')
+
+        index, header, stop_token = _read_until_stop(index, text, [param_open, invoke_close])
+
+        names = re.findall(r'^\s*name="(.*?)">\n$', header, flags=re.DOTALL)
+        if len(names) != 1:
+            raise ValueError(f"Tool name format error: '{header}'")
+        tool_name = names[0]
+
+        # Maps parameter name -> (raw value, "true"/"false" string flag).
+        tool_args: dict[str, tuple[str, str]] = {}
+        while stop_token == param_open:
+            index, param_body, stop_token = _read_until_stop(index, text, [param_close])
+
+            matches = re.findall(r'^ name="(.*?)" string="(true|false)">(.*?)<$', param_body, flags=re.DOTALL)
+            if len(matches) != 1:
+                raise ValueError(f"Parameter format error: '{param_body}'")
+            param_name, is_string, param_value = matches[0]
+
+            if param_name in tool_args:
+                raise ValueError(f"Duplicate parameter name: '{param_name}'")
+            tool_args[param_name] = (param_value, is_string)
+
+            index, trailer, stop_token = _read_until_stop(index, text, [param_open, invoke_close])
+            if trailer != '>\n':
+                raise ValueError(f"Parameter format error: expected '>\\n' but got '{trailer}'")
+
+        parsed_calls.append(decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args))
+
+    return index, stop_token, parsed_calls
+
+
+def parse_message_from_completion_text(text: str, thinking_mode: str) -> dict[str, Any]:
+    """Parse a model completion text into a structured assistant message.
+
+    This function takes the raw text output from the model (a single assistant turn)
+    and extracts:
+    - reasoning_content (thinking block, only read when thinking_mode is "thinking")
+    - content (summary/response)
+    - tool_calls (if any)
+
+    NOTE: This function is designed to parse only correctly formatted strings.
+    Structural checks in this function are plain ``assert`` statements, so
+    malformed output surfaces as AssertionError here (and the checks vanish
+    under ``python -O``); ``parse_tool_calls`` raises ValueError for a
+    malformed tool-call block.
+
+    Args:
+        text: The raw completion text (including EOS token).
+        thinking_mode: Either "chat" or "thinking".
+
+    Returns:
+        Dict with keys: "role", "content", "reasoning_content", "tool_calls".
+        tool_calls are the result of ``tool_calls_to_openai_format``.
+
+    Raises:
+        AssertionError: If an expected delimiter is missing, unexpected text
+            follows the tool calls, or a special token appears inside the
+            content or reasoning.
+        ValueError: Propagated from ``parse_tool_calls``.
+    """
+    summary_content, reasoning_content, tool_calls = '', '', []
+    index, stop_token = 0, None
+    tool_calls_start_token = f'\n\n<{dsml_token}{tool_calls_block_name}'
+
+    is_thinking = thinking_mode == 'thinking'
+    is_tool_calling = False
+
+    if is_thinking:
+        # The reasoning must be closed by the thinking end token; reaching the
+        # tool-call opener first is treated as a format error.
+        index, content_delta, stop_token = _read_until_stop(index, text, [thinking_end_token, tool_calls_start_token])
+        reasoning_content = content_delta
+        assert stop_token == thinking_end_token, 'Invalid thinking format: missing '
+
+    # Everything up to EOS or the tool-call opener is the visible answer.
+    index, content_delta, stop_token = _read_until_stop(index, text, [eos_token, tool_calls_start_token])
+    summary_content = content_delta
+    if stop_token == tool_calls_start_token:
+        is_tool_calling = True
+    else:
+        assert stop_token == eos_token, 'Invalid format: missing EOS token'
+
+    if is_tool_calling:
+        index, stop_token, tool_calls = parse_tool_calls(index, text)
+
+        # Nothing but the EOS token may follow the tool-call block.
+        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
+        assert not tool_ends_text, 'Unexpected content after tool calls'
+
+    # The whole input must have been consumed by now.
+    assert len(text) == index and stop_token in [eos_token, None], 'Unexpected content at end'
+
+    # Special tokens must not leak into the extracted text.
+    for sp_token in [bos_token, eos_token, thinking_start_token, thinking_end_token, dsml_token]:
+        assert sp_token not in summary_content and sp_token not in reasoning_content, \
+            f"Unexpected special token '{sp_token}' in content"
+
+    return {
+        'role': 'assistant',
+        'content': summary_content,
+        'reasoning_content': reasoning_content,
+        'tool_calls': tool_calls_to_openai_format(tool_calls)
+    }
diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index d2394fec4c..d5e59b3fc1 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -1,6 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import json
+import os
import uuid
from typing import Literal
@@ -642,6 +643,142 @@ def match(cls, model_path: str, **kwargs) -> str | None:
return 'deepseek-vl2'
+@MODELS.register_module(name=['deepseek-v4'])
+class DeepseekV4ChatTemplate(BaseChatTemplate):
+    """Chat template of DeepSeek-V4 models.
+
+    Prompt rendering is delegated to
+    ``lmdeploy.deepseek_v4_encoding.encode_messages``.
+    """
+
+    def __init__(self, eoa='<|end▁of▁sentence|>', stop_words=['<|end▁of▁sentence|>'], **kwargs):
+        super().__init__(eoa=eoa, stop_words=stop_words, **kwargs)
+
+    def get_prompt(self, prompt, sequence_start=True, **kwargs):
+        """Encode a single user prompt by wrapping it into a one-message chat."""
+        return self.messages2prompt([{'role': 'user', 'content': prompt}], sequence_start, **kwargs)
+
+    def messages2prompt(self, messages, sequence_start=True, **kwargs):
+        """Encode messages (or a bare string) into a DeepSeek-V4 prompt.
+
+        Recognised kwargs: ``tools``, ``reasoning_effort`` (only "high" and
+        "max" are forwarded, anything else becomes None), ``thinking`` /
+        ``enable_thinking`` (either one switches to thinking mode) and
+        ``drop_thinking`` (default True).
+        """
+        from lmdeploy.deepseek_v4_encoding import encode_messages
+
+        if isinstance(messages, str):
+            messages = [{'role': 'user', 'content': messages}]
+
+        tools = self._normalize_tools(kwargs.pop('tools', None))
+        if tools:
+            messages = self._with_tools(messages, tools)
+        else:
+            messages = list(messages)
+
+        effort = kwargs.pop('reasoning_effort', None)
+        reasoning_effort = effort if effort in ('high', 'max') else None
+
+        thinking_flag = kwargs.pop('thinking', False)
+        enable_flag = kwargs.pop('enable_thinking', False)
+        mode = 'thinking' if (thinking_flag or enable_flag) else 'chat'
+
+        drop_thinking = kwargs.pop('drop_thinking', True)
+        return encode_messages(messages,
+                               thinking_mode=mode,
+                               drop_thinking=drop_thinking,
+                               add_default_bos_token=sequence_start,
+                               reasoning_effort=reasoning_effort)
+
+    @staticmethod
+    def _normalize_tools(tools):
+        """Return tools as a list of ``{'type': 'function', 'function': ...}`` dicts.
+
+        Objects exposing ``model_dump`` are converted through it, non-dict
+        entries are skipped, and None is returned when nothing usable remains.
+        """
+        if not tools:
+            return None
+
+        wrapped = []
+        for entry in tools:
+            spec = entry.model_dump() if hasattr(entry, 'model_dump') else entry
+            if isinstance(spec, dict):
+                # Bare function specs get the OpenAI-style envelope.
+                wrapped.append(spec if 'function' in spec else {'type': 'function', 'function': spec})
+        return wrapped if wrapped else None
+
+    @staticmethod
+    def _with_tools(messages, tools):
+        """Return shallow copies of messages with tools attached.
+
+        Tools go onto the first system/developer message; when there is none,
+        an empty system message carrying the tools is prepended.
+        """
+        copies = [dict(message) for message in messages]
+        carrier = next((m for m in copies if m.get('role') in ('system', 'developer')), None)
+        if carrier is None:
+            return [{'role': 'system', 'content': '', 'tools': tools}] + copies
+        carrier['tools'] = tools
+        return copies
+
+    @classmethod
+    def match(cls, model_path: str, trust_remote_code: bool = False, **kwargs) -> str | None:
+        """Return 'deepseek-v4' when the model config identifies a DeepSeek-V4 model."""
+        try:
+            arch, cfg = get_model_arch(model_path, trust_remote_code=trust_remote_code)
+            cfg_dict = cfg.to_dict()
+        except Exception:
+            # Best-effort fallback: read config.json directly.
+            cfg_dict = {}
+            config_file = os.path.join(model_path, 'config.json')
+            if os.path.exists(config_file):
+                try:
+                    with open(config_file, encoding='utf-8') as fp:
+                        cfg_dict = json.load(fp)
+                except Exception:
+                    cfg_dict = {}
+            arch = (cfg_dict.get('architectures') or [None])[0]
+
+        is_v4 = arch == 'DeepseekV4ForCausalLM' or cfg_dict.get('model_type') == 'deepseek_v4'
+        return 'deepseek-v4' if is_v4 else None
+
+
+@MODELS.register_module(name=['deepseek-v32', 'deepseek-v3.2'])
+class DeepseekV32ChatTemplate(BaseChatTemplate):
+    """Chat template of DeepSeek-V3.2 models.
+
+    Prompt rendering is delegated to
+    ``lmdeploy.deepseek_v32_encoding.encode_messages``.
+    """
+
+    def __init__(self, eoa='<|end▁of▁sentence|>', stop_words=['<|end▁of▁sentence|>'], **kwargs):
+        super().__init__(eoa=eoa, stop_words=stop_words, **kwargs)
+
+    def get_prompt(self, prompt, sequence_start=True, **kwargs):
+        """Encode a single user prompt by wrapping it into a one-message chat."""
+        return self.messages2prompt([{'role': 'user', 'content': prompt}], sequence_start, **kwargs)
+
+    def messages2prompt(self, messages, sequence_start=True, **kwargs):
+        """Encode messages (or a bare string) into a DeepSeek-V3.2 prompt.
+
+        Recognised kwargs: ``tools``, ``thinking`` / ``enable_thinking``
+        (either one switches to thinking mode) and ``drop_thinking``. When
+        ``drop_thinking`` is absent or None it is True only if the last
+        message has role "user".
+        """
+        from lmdeploy.deepseek_v32_encoding import encode_messages
+
+        if isinstance(messages, str):
+            messages = [{'role': 'user', 'content': messages}]
+
+        # Tool handling is shared with the V4 template.
+        tools = DeepseekV4ChatTemplate._normalize_tools(kwargs.pop('tools', None))
+        if tools:
+            messages = DeepseekV4ChatTemplate._with_tools(messages, tools)
+        else:
+            messages = list(messages)
+
+        thinking_flag = kwargs.pop('thinking', False)
+        enable_flag = kwargs.pop('enable_thinking', False)
+        mode = 'thinking' if (thinking_flag or enable_flag) else 'chat'
+
+        drop_thinking = kwargs.pop('drop_thinking', None)
+        if drop_thinking is None:
+            drop_thinking = bool(messages and messages[-1].get('role') == 'user')
+
+        return encode_messages(messages,
+                               thinking_mode=mode,
+                               drop_thinking=drop_thinking,
+                               add_default_bos_token=sequence_start)
+
+    @classmethod
+    def match(cls, model_path: str, trust_remote_code: bool = False, **kwargs) -> str | None:
+        """Return 'deepseek-v32' when the model config identifies a DeepSeek-V3.2 model."""
+        try:
+            arch, cfg = get_model_arch(model_path, trust_remote_code=trust_remote_code)
+            cfg_dict = cfg.to_dict()
+        except Exception:
+            # Best-effort fallback: read config.json directly.
+            cfg_dict = {}
+            config_file = os.path.join(model_path, 'config.json')
+            if os.path.exists(config_file):
+                try:
+                    with open(config_file, encoding='utf-8') as fp:
+                        cfg_dict = json.load(fp)
+                except Exception:
+                    cfg_dict = {}
+            arch = (cfg_dict.get('architectures') or [None])[0]
+
+        is_v32 = arch == 'DeepseekV32ForCausalLM' or cfg_dict.get('model_type') == 'deepseek_v32'
+        return 'deepseek-v32' if is_v32 else None
+
+
@MODELS.register_module(name=['llava-chatml'])
class ChatmlDirect(BaseChatTemplate):
diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
index 68d4c51f85..44bfeacc45 100644
--- a/lmdeploy/serve/core/async_engine.py
+++ b/lmdeploy/serve/core/async_engine.py
@@ -481,7 +481,7 @@ async def generate(
session_id: int | Session,
gen_config: GenerationConfig | None = None,
tools: list[object] | None = None,
- reasoning_effort: Literal['low', 'medium', 'high'] | None = None,
+ reasoning_effort: Literal['low', 'medium', 'high', 'max'] | None = None,
stream_response: bool = True,
sequence_start: bool = True,
sequence_end: bool = True, # no interactive mode by default
diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py
index 675cbf5103..72eb841cf5 100644
--- a/lmdeploy/serve/openai/protocol.py
+++ b/lmdeploy/serve/openai/protocol.py
@@ -172,7 +172,7 @@ class ChatCompletionRequest(BaseModel):
presence_penalty: float | None = 0.0
frequency_penalty: float | None = 0.0
user: str | None = None
- reasoning_effort: Literal['low', 'medium', 'high'] | None = None
+ reasoning_effort: Literal['low', 'medium', 'high', 'max'] | None = None
response_format: ResponseFormat | None = Field(default=None, examples=[None])
# additional argument of lmdeploy
do_preprocess: bool | None = True
diff --git a/lmdeploy/serve/parsers/reasoning_parser/__init__.py b/lmdeploy/serve/parsers/reasoning_parser/__init__.py
index 1f29020d7f..7b08f76525 100644
--- a/lmdeploy/serve/parsers/reasoning_parser/__init__.py
+++ b/lmdeploy/serve/parsers/reasoning_parser/__init__.py
@@ -1,5 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser
+from .deepseek_v4_reasoning_parser import DeepSeekV4ReasoningParser
+from .deepseek_v32_reasoning_parser import DeepSeekV32ReasoningParser
from .reasoning_parser import LEGACY_REASONING_PARSER_NAMES, ReasoningParser, ReasoningParserManager
__all__ = [
@@ -7,4 +9,6 @@
'ReasoningParser',
'ReasoningParserManager',
'DeepSeekV3ReasoningParser',
+ 'DeepSeekV32ReasoningParser',
+ 'DeepSeekV4ReasoningParser',
]
diff --git a/lmdeploy/serve/parsers/reasoning_parser/deepseek_v32_reasoning_parser.py b/lmdeploy/serve/parsers/reasoning_parser/deepseek_v32_reasoning_parser.py
new file mode 100644
index 0000000000..b00f74ed34
--- /dev/null
+++ b/lmdeploy/serve/parsers/reasoning_parser/deepseek_v32_reasoning_parser.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .reasoning_parser import ReasoningParser, ReasoningParserManager
+
+
+@ReasoningParserManager.register_module(['deepseek-v32', 'deepseek-v3.2'])
+class DeepSeekV32ReasoningParser(ReasoningParser):
+    """Reasoning parser for DeepSeek-V3.2 thinking mode."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Both switches stay None when the caller did not supply them.
+        self.thinking = kwargs.get('thinking')
+        self.enable_thinking = kwargs.get('enable_thinking')
+
+    def starts_in_reasoning_mode(self) -> bool:
+        """Return True only when one of the switches is exactly ``True``."""
+        switches = (self.thinking, self.enable_thinking)
+        return any(flag is True for flag in switches)
diff --git a/lmdeploy/serve/parsers/reasoning_parser/deepseek_v4_reasoning_parser.py b/lmdeploy/serve/parsers/reasoning_parser/deepseek_v4_reasoning_parser.py
new file mode 100644
index 0000000000..a6ed06c1f8
--- /dev/null
+++ b/lmdeploy/serve/parsers/reasoning_parser/deepseek_v4_reasoning_parser.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .reasoning_parser import ReasoningParser, ReasoningParserManager
+
+
+@ReasoningParserManager.register_module('deepseek-v4')
+class DeepSeekV4ReasoningParser(ReasoningParser):
+    """Reasoning parser for DeepSeek-V4 thinking mode."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Both switches stay None when the caller did not supply them.
+        self.thinking = kwargs.get('thinking')
+        self.enable_thinking = kwargs.get('enable_thinking')
+
+    def starts_in_reasoning_mode(self) -> bool:
+        """Return True only when one of the switches is exactly ``True``."""
+        switches = (self.thinking, self.enable_thinking)
+        return any(flag is True for flag in switches)
diff --git a/lmdeploy/serve/parsers/response_parser.py b/lmdeploy/serve/parsers/response_parser.py
index dff1bc4b91..f7bce1aba7 100644
--- a/lmdeploy/serve/parsers/response_parser.py
+++ b/lmdeploy/serve/parsers/response_parser.py
@@ -274,6 +274,8 @@ def chat_template_kwargs_from_request(cls, request: ChatCompletionRequest) -> di
else:
logger.warning(
'`enable_thinking` in `chat_template_kwargs` will override the value in request.')
+ if request.reasoning_effort in ('high', 'max'):
+ chat_template_kwargs.setdefault('reasoning_effort', request.reasoning_effort)
return chat_template_kwargs
def __init__(self, request: ChatCompletionRequest):
@@ -281,6 +283,8 @@ def __init__(self, request: ChatCompletionRequest):
tcls = type(self).tool_parser_cls
self._kwargs = type(self).chat_template_kwargs_from_request(request)
self.enable_thinking: bool | None = self._kwargs.get('enable_thinking', None)
+ if self._kwargs.get('thinking') is True:
+ self.enable_thinking = True
self.reasoning_parser: ReasoningParser | None = rcls(**self._kwargs) if rcls else None
self.tool_parser: ToolParser | None = tcls() if tcls else None
if self.tool_parser is not None:
@@ -672,8 +676,11 @@ def parse_complete(
close_idx = n
tool_payload = text[open_idx + len(open_tag):].strip()
parsed_call = self.tool_parser.parse_tool_call_complete(tool_payload) if self.tool_parser else None
- if parsed_call is not None:
- tool_calls.append(parsed_call)
+ if parsed_call:
+ if isinstance(parsed_call, list):
+ tool_calls.extend(parsed_call)
+ else:
+ tool_calls.append(parsed_call)
pos = close_idx + len(close_tag) if close_tag else n
else:
# Tool call parsing failed — fall back to plain text.
diff --git a/lmdeploy/serve/parsers/tool_parser/__init__.py b/lmdeploy/serve/parsers/tool_parser/__init__.py
index f5c547ac1e..f9c5ceaba7 100644
--- a/lmdeploy/serve/parsers/tool_parser/__init__.py
+++ b/lmdeploy/serve/parsers/tool_parser/__init__.py
@@ -1,4 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from .deepseek_v4_tool_parser import DeepSeekV4ToolParser
+from .deepseek_v32_tool_parser import DeepSeekV32ToolParser
from .glm47_tool_parser import Glm47ToolParser
from .internlm2_tool_parser import Internlm2ToolParser
from .interns2preview_tool_parser import InternS2PreviewToolParser
@@ -13,6 +15,8 @@
'ToolParser',
'ToolParserManager',
'XmlToolParser',
+ 'DeepSeekV32ToolParser',
+ 'DeepSeekV4ToolParser',
'Glm47ToolParser',
'Internlm2ToolParser',
'Llama3JsonToolParser',
diff --git a/lmdeploy/serve/parsers/tool_parser/deepseek_v32_tool_parser.py b/lmdeploy/serve/parsers/tool_parser/deepseek_v32_tool_parser.py
new file mode 100644
index 0000000000..62f7f03a1d
--- /dev/null
+++ b/lmdeploy/serve/parsers/tool_parser/deepseek_v32_tool_parser.py
@@ -0,0 +1,107 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from __future__ import annotations
+
+import shortuuid
+
+from lmdeploy.deepseek_v32_encoding import dsml_token, parse_tool_calls
+from lmdeploy.serve.openai.protocol import (
+ DeltaFunctionCall,
+ DeltaToolCall,
+ FunctionCall,
+ ToolCall,
+)
+
+from .tool_parser import ToolParser, ToolParserManager
+
+TOOL_CALLS_BLOCK_NAME = 'function_calls'
+
+
+@ToolParserManager.register_module(['deepseek-v32', 'deepseek-v3.2'])
+class DeepSeekV32ToolParser(ToolParser):
+    """Tool parser for DeepSeek-V3.2 DSML function-call blocks.
+
+    The three class attributes below parameterise the tags and the parsing
+    function, so a subclass can rebind them without touching the methods.
+    """
+
+    dsml_token = dsml_token
+    tool_calls_block_name = TOOL_CALLS_BLOCK_NAME
+    parse_tool_calls_func = staticmethod(parse_tool_calls)
+
+    @classmethod
+    def get_tool_open_tag(cls) -> str | None:
+        """Return the text that opens a tool-call block."""
+        return '\n\n<{}{}>'.format(cls.dsml_token, cls.tool_calls_block_name)
+
+    @classmethod
+    def get_tool_close_tag(cls) -> str | None:
+        """Return the text that closes a tool-call block."""
+        return '{}{}>'.format(cls.dsml_token, cls.tool_calls_block_name)
+
+    @classmethod
+    def get_tool_payload_format(cls) -> str:
+        """Return the payload format identifier of this parser."""
+        return 'dsml'
+
+    def decode_tool_incremental(self, added_text: str, *, final: bool) -> list[DeltaToolCall]:
+        """Buffer streamed payload text and emit deltas once the block is final.
+
+        Nothing is emitted for intermediate chunks. On the final chunk every
+        parsed call yields two deltas: one with id/name, one with arguments.
+        """
+        self._tool_payload += added_text
+        if not final:
+            return []
+
+        parsed = self.parse_tool_call_complete(self._tool_payload)
+        if not parsed:
+            return []
+
+        deltas: list[DeltaToolCall] = []
+        for slot, call in enumerate(parsed, start=self._active_tool_index):
+            header = DeltaToolCall(
+                id=f'chatcmpl-tool-{shortuuid.random()}',
+                index=slot,
+                type='function',
+                function=DeltaFunctionCall(name=call.function.name),
+            )
+            arguments = DeltaToolCall(
+                id=None,
+                index=slot,
+                type=None,
+                function=DeltaFunctionCall(arguments=call.function.arguments),
+            )
+            deltas += [header, arguments]
+
+        # Advance by the extra calls only (one fewer than the number parsed).
+        self._active_tool_index += len(parsed) - 1
+        return deltas
+
+    def parse_tool_call_complete(self, payload: str) -> list[ToolCall] | None:
+        """Parse the inside of one tool-call block into ToolCall objects.
+
+        Returns None for an empty payload, on any parsing error, when the
+        parser did not stop on the close tag, or when no call was found.
+        """
+        body = payload.strip()
+        if not body:
+            return None
+
+        open_tag = self.get_tool_open_tag()
+        close_tag = self.get_tool_close_tag()
+        wrapped = f'{open_tag}\n{body}\n{close_tag}'
+        # Start on the last character of the open tag so the parser first
+        # reads that character plus the newline inserted above.
+        start = len(open_tag) - 1
+        try:
+            _, stop_token, raw_calls = self.parse_tool_calls_func(start, wrapped)
+        except Exception:
+            return None
+        if stop_token != close_tag or not raw_calls:
+            return None
+
+        return [
+            ToolCall(function=FunctionCall(name=raw['name'], arguments=raw['arguments']))
+            for raw in raw_calls
+        ]
+
+    def validate_complete(self, text: str) -> bool:
+        """Check that every tool-call block in ``text`` is closed and parseable."""
+        open_tag = self.get_tool_open_tag()
+        close_tag = self.get_tool_close_tag()
+
+        cursor = 0
+        while True:
+            begin = text.find(open_tag, cursor)
+            end = text.find(close_tag, cursor)
+            if begin < 0:
+                # No further block: valid only if no stray close tag remains.
+                return end < 0
+
+            inner_start = begin + len(open_tag)
+            well_formed = (end >= inner_start
+                           and self.parse_tool_call_complete(text[inner_start:end]) is not None)
+            if not well_formed:
+                return False
+
+            cursor = end + len(close_tag)
+            if cursor >= len(text):
+                return True
diff --git a/lmdeploy/serve/parsers/tool_parser/deepseek_v4_tool_parser.py b/lmdeploy/serve/parsers/tool_parser/deepseek_v4_tool_parser.py
new file mode 100644
index 0000000000..9ee633b741
--- /dev/null
+++ b/lmdeploy/serve/parsers/tool_parser/deepseek_v4_tool_parser.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from __future__ import annotations
+
+from lmdeploy.deepseek_v4_encoding import dsml_token, parse_tool_calls, tool_calls_block_name
+
+from .deepseek_v32_tool_parser import DeepSeekV32ToolParser
+from .tool_parser import ToolParserManager
+
+
+@ToolParserManager.register_module(['deepseek-v4'])
+class DeepSeekV4ToolParser(DeepSeekV32ToolParser):
+    """Tool parser for DeepSeek-V4 DSML tool-call blocks.
+
+    All methods are inherited from ``DeepSeekV32ToolParser``; this class only
+    rebinds the three class-level hooks to the values imported from
+    ``lmdeploy.deepseek_v4_encoding``.
+    """
+
+    # The right-hand sides resolve to the module-level names imported above.
+    dsml_token = dsml_token
+    tool_calls_block_name = tool_calls_block_name
+    parse_tool_calls_func = staticmethod(parse_tool_calls)
diff --git a/lmdeploy/serve/parsers/tool_parser/tool_parser.py b/lmdeploy/serve/parsers/tool_parser/tool_parser.py
index f95b662a12..33b19cbd38 100644
--- a/lmdeploy/serve/parsers/tool_parser/tool_parser.py
+++ b/lmdeploy/serve/parsers/tool_parser/tool_parser.py
@@ -73,7 +73,7 @@ def decode_tool_incremental(self, added_text: str, *, final: bool) -> list[Delta
"""Decode incremental tool payload emitted between tool tags."""
raise NotImplementedError('ToolParser.decode_tool_incremental has not been implemented!')
- def parse_tool_call_complete(self, payload: str) -> ToolCall | None:
+ def parse_tool_call_complete(self, payload: str) -> ToolCall | list[ToolCall] | None:
"""Parse one complete tool payload into OpenAI tool call object."""
raise NotImplementedError('ToolParser.parse_tool_call_complete has not been implemented!')
diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index cf2452935e..4917031787 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -186,7 +186,7 @@ async def get_prompt_input(self,
sequence_start: bool,
adapter_name: str,
tools: list[object] | None = None,
- reasoning_effort: Literal['low', 'medium', 'high'] | None = None,
+ reasoning_effort: Literal['low', 'medium', 'high', 'max'] | None = None,
chat_template_kwargs: dict | None = None,
media_io_kwargs: dict[str, Any] | None = None,
mm_processor_kwargs: dict[str, Any] | None = None,
@@ -346,7 +346,7 @@ async def _get_text_prompt_input(self,
sequence_start: bool,
adapter_name: str,
tools: list[object] | None = None,
- reasoning_effort: Literal['low', 'medium', 'high'] | None = None,
+ reasoning_effort: Literal['low', 'medium', 'high', 'max'] | None = None,
chat_template_kwargs: dict | None = None,
**kwargs):
"""Process text-only prompt and return prompt string and input_ids."""
diff --git a/tests/test_lmdeploy/test_deepseek_v32_encoding.py b/tests/test_lmdeploy/test_deepseek_v32_encoding.py
new file mode 100644
index 0000000000..64e6723ea7
--- /dev/null
+++ b/tests/test_lmdeploy/test_deepseek_v32_encoding.py
@@ -0,0 +1,254 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+
+from lmdeploy.deepseek_v32_encoding import (
+ bos_token,
+ encode_messages,
+ eos_token,
+ parse_message_from_completion_text,
+)
+from lmdeploy.model import MODELS, DeepseekV32ChatTemplate, get_chat_template
+from lmdeploy.serve.openai.protocol import ChatCompletionRequest
+from lmdeploy.serve.parsers import ResponseParserManager
+from lmdeploy.serve.parsers.reasoning_parser import ReasoningParserManager
+from lmdeploy.serve.parsers.tool_parser import ToolParserManager
+
+# Tool definition shared by the tests below: one function, ``get_weather``,
+# with a single required string parameter ``city``.
+WEATHER_TOOL = {
+    'type': 'function',
+    'function': {
+        'name': 'get_weather',
+        'description': 'Get weather for a city.',
+        'parameters': {
+            'type': 'object',
+            'properties': {
+                'city': {
+                    'type': 'string'
+                }
+            },
+            'required': ['city'],
+        },
+    },
+}
+
+
+def test_deepseek_v32_minimal_chat_and_thinking_modes():
+    """Check the encoded prompt of a single user message in both modes."""
+    messages = [{'role': 'user', 'content': 'Hello'}]
+
+    assert encode_messages(messages, thinking_mode='chat') == (
+        f'{bos_token}<|User|>Hello<|Assistant|>'
+    )
+    assert encode_messages(messages, thinking_mode='thinking') == (
+        f'{bos_token}<|User|>Hello<|Assistant|>'
+    )
+
+
+def test_deepseek_v32_uses_function_call_block():
+    """Check that tools and an assistant tool call render as a function_calls block."""
+    messages = [
+        {
+            'role': 'system',
+            'content': 'You may call tools.',
+            'tools': [WEATHER_TOOL],
+        },
+        {
+            'role': 'user',
+            'content': 'Weather in Paris?',
+        },
+        {
+            'role': 'assistant',
+            'reasoning_content': 'I should call the weather tool.',
+            'tool_calls': [{
+                'type': 'function',
+                'function': {
+                    'name': 'get_weather',
+                    'arguments': '{"city": "Paris"}',
+                },
+            }],
+        },
+    ]
+
+    prompt = encode_messages(messages, thinking_mode='thinking', drop_thinking=False)
+
+    # The tool schema section and the V3.2 block name must both be present;
+    # the tool_calls block name must not appear.
+    assert '## Tools' in prompt
+    assert '"name": "get_weather"' in prompt
+    assert '<|DSML|function_calls>' in prompt
+    assert '|DSML|function_calls>' in prompt
+    assert '<|DSML|tool_calls>' not in prompt
+    assert '<|DSML|parameter name="city" string="true">Paris' in prompt
+
+
+def test_deepseek_v32_tool_results_reopen_thinking():
+    """Check how a tool result following an assistant tool call is rendered."""
+    messages = [
+        {
+            'role': 'user',
+            'content': 'Weather in Paris?',
+        },
+        {
+            'role': 'assistant',
+            'tool_calls': [{
+                'id': 'call_1',
+                'type': 'function',
+                'function': {
+                    'name': 'get_weather',
+                    'arguments': '{"city": "Paris"}',
+                },
+            }],
+        },
+        {
+            'role': 'tool',
+            'tool_call_id': 'call_1',
+            'content': 'Sunny',
+        },
+    ]
+
+    prompt = encode_messages(messages, thinking_mode='thinking', drop_thinking=False)
+
+    assert '\nSunny\n\n\n' in prompt
+
+
+def test_deepseek_v32_parse_completion_text():
+    """Check that a completion with one tool call parses into its fields."""
+    completion = (
+        'I should call a tool.\n\n'
+        '<|DSML|function_calls>\n'
+        '<|DSML|invoke name="get_weather">\n'
+        '<|DSML|parameter name="city" string="true">Paris|DSML|parameter>\n'
+        '|DSML|invoke>\n'
+        '|DSML|function_calls>'
+        f'{eos_token}'
+    )
+
+    parsed = parse_message_from_completion_text(completion, thinking_mode='thinking')
+
+    assert parsed['reasoning_content'] == 'I should call a tool.'
+    assert parsed['content'] == ''
+    assert parsed['tool_calls'][0]['function']['name'] == 'get_weather'
+    # Arguments come back as a JSON string.
+    assert json.loads(parsed['tool_calls'][0]['function']['arguments']) == {'city': 'Paris'}
+
+
+def test_deepseek_v32_chat_template_uses_vllm_thinking_switches():
+    """Check the prompt with no switch, ``thinking=True`` and ``enable_thinking=True``."""
+    model = MODELS.get('deepseek-v32')()
+    assert model.messages2prompt([{'role': 'user', 'content': 'Hello'}]) == (
+        f'{bos_token}<|User|>Hello<|Assistant|>'
+    )
+    assert model.messages2prompt([{'role': 'user', 'content': 'Hello'}], thinking=True) == (
+        f'{bos_token}<|User|>Hello<|Assistant|>'
+    )
+    assert model.messages2prompt([{'role': 'user', 'content': 'Hello'}], enable_thinking=True) == (
+        f'{bos_token}<|User|>Hello<|Assistant|>'
+    )
+
+
+def test_deepseek_v32_chat_template_normalizes_lmdeploy_tools_and_dict_arguments():
+    """Check bare tool specs and dict-valued tool-call arguments are accepted.
+
+    The tool is passed without the ``{'type': 'function', 'function': ...}``
+    envelope and the assistant tool call carries ``arguments`` as a dict
+    rather than a JSON string.
+    """
+    model = MODELS.get('deepseek-v32')()
+    prompt = model.messages2prompt(
+        [
+            {'role': 'user', 'content': 'List files'},
+            {
+                'role': 'assistant',
+                'tool_calls': [
+                    {
+                        'type': 'function',
+                        'function': {
+                            'name': 'str_replace_editor',
+                            'arguments': {
+                                'command': 'view',
+                                'path': '/testbed',
+                            },
+                        },
+                    }
+                ],
+            },
+        ],
+        tools=[
+            {
+                'name': 'str_replace_editor',
+                'description': 'Edit files',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'command': {
+                            'type': 'string'
+                        },
+                        'path': {
+                            'type': 'string'
+                        },
+                    },
+                    'required': ['command', 'path'],
+                },
+            }
+        ],
+        enable_thinking=True,
+        drop_thinking=False,
+    )
+    assert '## Tools' in prompt
+    assert '<|DSML|function_calls>' in prompt
+    assert '<|DSML|tool_calls>' not in prompt
+    assert '"name": "str_replace_editor"' in prompt
+    # Each dict entry must become its own parameter, not one "arguments" blob.
+    assert '<|DSML|parameter name="command" string="true">view' in prompt
+    assert '<|DSML|parameter name="path" string="true">/testbed' in prompt
+    assert 'parameter name="arguments"' not in prompt
+
+
+def test_deepseek_v32_chat_template_match_minimal_config(tmp_path):
+    """Check template matching against a directory holding only a minimal config.json."""
+    (tmp_path / 'config.json').write_text(
+        json.dumps({
+            'model_type': 'deepseek_v32',
+            'architectures': ['DeepseekV32ForCausalLM'],
+        }),
+        encoding='utf-8',
+    )
+    assert DeepseekV32ChatTemplate.match(str(tmp_path)) == 'deepseek-v32'
+    assert isinstance(get_chat_template(str(tmp_path)), DeepseekV32ChatTemplate)
+
+
+def _make_response_parser(thinking=True):
+    """Build a response parser wired to the DeepSeek-V3.2 reasoning and tool parsers.
+
+    ``thinking`` is passed to the request through ``chat_template_kwargs``.
+    """
+    cls = ResponseParserManager.get('default')
+    # NOTE(review): these assignments set attributes on the class object
+    # returned by the manager; if that object is shared, the change outlives
+    # this helper and may leak into other tests — confirm.
+    cls.reasoning_parser_cls = ReasoningParserManager.get('deepseek-v32')
+    cls.tool_parser_cls = ToolParserManager.get('deepseek-v32')
+    request = ChatCompletionRequest(
+        model='deepseek-ai/DeepSeek-V3.2',
+        messages=[],
+        stream=True,
+        chat_template_kwargs={'thinking': thinking},
+    )
+    return cls(request=request)
+
+
+def test_deepseek_v32_response_parser_complete_dsml_function_calls():
+    """Check non-streaming parsing of a completion holding one DSML tool call."""
+    completion = (
+        'I should call a tool.\n\n'
+        '<|DSML|function_calls>\n'
+        '<|DSML|invoke name="get_weather">\n'
+        '<|DSML|parameter name="city" string="true">Paris|DSML|parameter>\n'
+        '|DSML|invoke>\n'
+        '|DSML|function_calls>'
+    )
+
+    parser = _make_response_parser(thinking=True)
+    content, tool_calls, reasoning_content = parser.parse_complete(completion)
+    assert content is None
+    assert reasoning_content == 'I should call a tool.'
+    assert tool_calls is not None
+    assert len(tool_calls) == 1
+    assert tool_calls[0].function.name == 'get_weather'
+    assert json.loads(tool_calls[0].function.arguments) == {'city': 'Paris'}
+    assert parser.validate_complete(completion)
+
+
+def test_deepseek_v32_response_parser_streaming_dsml_function_calls():
+    """Check streaming parsing when the whole completion arrives as one chunk."""
+    text = (
+        'need data\n\n'
+        '<|DSML|function_calls>\n'
+        '<|DSML|invoke name="search">\n'
+        '<|DSML|parameter name="query" string="true">DeepSeek V3.2|DSML|parameter>\n'
+        '|DSML|invoke>\n'
+        '|DSML|function_calls>'
+    )
+    parser = _make_response_parser(thinking=True)
+
+    deltas = parser.stream_chunk(delta_text=text, delta_token_ids=[])
+    reasoning = ''.join(delta.reasoning_content or '' for delta, _ in deltas)
+    tool_deltas = [tool_call for delta, _ in deltas for tool_call in (delta.tool_calls or [])]
+
+    assert reasoning == 'need data'
+    # The first delta carries the function name, the second the arguments.
+    assert tool_deltas[0].function.name == 'search'
+    assert json.loads(tool_deltas[1].function.arguments) == {'query': 'DeepSeek V3.2'}
diff --git a/tests/test_lmdeploy/test_deepseek_v4_encoding.py b/tests/test_lmdeploy/test_deepseek_v4_encoding.py
new file mode 100644
index 0000000000..5acdad967f
--- /dev/null
+++ b/tests/test_lmdeploy/test_deepseek_v4_encoding.py
@@ -0,0 +1,274 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+
+from lmdeploy.deepseek_v4_encoding import (
+ REASONING_EFFORT_MAX,
+ bos_token,
+ encode_messages,
+ eos_token,
+ parse_message_from_completion_text,
+)
+from lmdeploy.model import MODELS, DeepseekV4ChatTemplate, get_chat_template
+from lmdeploy.serve.openai.protocol import ChatCompletionRequest
+from lmdeploy.serve.parsers import ResponseParserManager
+from lmdeploy.serve.parsers.reasoning_parser import ReasoningParserManager
+from lmdeploy.serve.parsers.tool_parser import ToolParserManager
+
+# Function-tool definition (a 'type'/'function' envelope around the schema)
+# attached to the system message in the tool-call encoding test below.
+WEATHER_TOOL = {
+    'type': 'function',
+    'function': {
+        'name': 'get_weather',
+        'description': 'Get weather for a city.',
+        'parameters': {
+            'type': 'object',
+            'properties': {
+                'city': {'type': 'string'},
+            },
+            'required': ['city'],
+        },
+    },
+}
+
+
+def test_deepseek_v4_minimal_chat_and_thinking_modes():
+    """Encode a lone user turn once per thinking mode and compare the prompts."""
+    # NOTE(review): both modes are compared against the same literal below -- confirm that is intended.
+    single_turn = [{'role': 'user', 'content': 'Hello'}]
+    chat_prompt = encode_messages(single_turn, thinking_mode='chat')
+    thinking_prompt = encode_messages(single_turn, thinking_mode='thinking')
+
+    assert chat_prompt == f'{bos_token}<|User|>Hello<|Assistant|>'
+    assert thinking_prompt == f'{bos_token}<|User|>Hello<|Assistant|>'
+
+
+def test_deepseek_v4_uses_v4_tool_call_block():
+    """Encode an assistant tool call and check the prompt uses the ``tool_calls`` DSML block."""
+    system_turn = {'role': 'system', 'content': 'You may call tools.', 'tools': [WEATHER_TOOL]}
+    user_turn = {'role': 'user', 'content': 'Weather in Paris?'}
+    weather_call = {
+        'type': 'function',
+        'function': {'name': 'get_weather', 'arguments': '{"city": "Paris"}'},
+    }
+    assistant_turn = {
+        'role': 'assistant',
+        'reasoning_content': 'I should call the weather tool.',
+        'tool_calls': [weather_call],
+    }
+
+    prompt = encode_messages(
+        [system_turn, user_turn, assistant_turn],
+        thinking_mode='thinking',
+        drop_thinking=False,
+    )
+
+    # NOTE(review): the bare '|DSML|tool_calls>' entry is a substring of the opening tag above it -- confirm.
+    expected_fragments = (
+        '## Tools',
+        '"name": "get_weather"',
+        '<|DSML|tool_calls>',
+        '|DSML|tool_calls>',
+        '<|DSML|parameter name="city" string="true">Paris',
+    )
+    for fragment in expected_fragments:
+        assert fragment in prompt
+    # The 'function_calls' block name must not show up anywhere in the prompt.
+    assert '<|DSML|function_calls>' not in prompt
+
+
+def test_deepseek_v4_merges_tool_results_into_user_blocks():
+    """Encode a tool round trip and check the tool result is rendered after the tool call."""
+    question = {'role': 'user', 'content': 'Weather in Paris?'}
+    pending_call = {
+        'id': 'call_1',
+        'type': 'function',
+        'function': {
+            'name': 'get_weather',
+            'arguments': '{"city": "Paris"}',
+        },
+    }
+    assistant_turn = {'role': 'assistant', 'tool_calls': [pending_call]}
+    tool_turn = {
+        'role': 'tool',
+        'tool_call_id': 'call_1',
+        'content': 'Sunny',
+    }
+
+    prompt = encode_messages(
+        [question, assistant_turn, tool_turn],
+        thinking_mode='chat',
+    )
+
+    # NOTE(review): only presence and ordering are asserted; the enclosing user block is not checked.
+    assert 'Sunny' in prompt
+    call_start = prompt.index('<|DSML|tool_calls>')
+    result_start = prompt.index('Sunny')
+    assert call_start < result_start
+
+
+def test_deepseek_v4_task_and_latest_reminder_rendering():
+    """Encode a ``latest_reminder`` turn followed by a user turn that carries a ``task``."""
+    reminder = {
+        'role': 'latest_reminder',
+        'content': 'Be terse.',
+    }
+    task_turn = {
+        'role': 'user',
+        'content': 'Classify this page.',
+        'task': 'domain',
+    }
+
+    prompt = encode_messages([reminder, task_turn], thinking_mode='chat')
+
+    assert '<|latest_reminder|>Be terse.' in prompt
+    # With a task set on the user turn, the prompt is expected to carry the
+    # task tag right after the user text and to contain no assistant tag.
+    assert '<|User|>Classify this page.<|domain|>' in prompt
+    assert '<|Assistant|>' not in prompt
+
+
+def test_deepseek_v4_parse_completion_text():
+    """Parse raw completion text that ends with the EOS token back into a message dict."""
+    body = ''.join([
+        'I should call a tool.\n\n',
+        '<|DSML|tool_calls>\n',
+        '<|DSML|invoke name="get_weather">\n',
+        '<|DSML|parameter name="city" string="true">Paris|DSML|parameter>\n',
+        '|DSML|invoke>\n',
+        '|DSML|tool_calls>',
+    ])
+    message = parse_message_from_completion_text(body + eos_token, thinking_mode='thinking')
+
+    assert message['reasoning_content'] == 'I should call a tool.'
+    assert message['content'] == ''
+    call = message['tool_calls'][0]['function']
+    assert call['name'] == 'get_weather'
+    assert json.loads(call['arguments']) == {'city': 'Paris'}
+
+
+def test_deepseek_v4_chat_template_normalizes_lmdeploy_tools():
+    """Pass a tool dict without the 'type'/'function' envelope through the V4 chat template."""
+    conversation = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {'role': 'user', 'content': "What's the weather in Beijing?"},
+    ]
+    location_schema = {
+        'type': 'object',
+        'properties': {'location': {'type': 'string'}},
+        'required': ['location'],
+    }
+    bare_tool = {
+        'name': 'get_weather',
+        'description': 'Get weather for a location.',
+        'parameters': location_schema,
+    }
+    template = MODELS.get('deepseek-v4')()
+
+    prompt = template.messages2prompt(
+        conversation,
+        tools=[bare_tool],
+        enable_thinking=True,
+        reasoning_effort='max',
+    )
+
+    assert prompt.startswith(bos_token + REASONING_EFFORT_MAX)
+    assert '## Tools' in prompt
+    assert '"name": "get_weather"' in prompt
+    assert prompt.endswith('<|Assistant|>')
+
+
+def test_deepseek_v4_reasoning_effort_does_not_enable_thinking():
+    """Check that ``reasoning_effort`` alone yields a prompt without the max-effort prefix."""
+    template_cls = MODELS.get('deepseek-v4')
+    conversation = [{'role': 'user', 'content': 'Hello'}]
+    prompt = template_cls().messages2prompt(conversation, reasoning_effort='max')
+
+    assert REASONING_EFFORT_MAX not in prompt
+    assert prompt == f'{bos_token}<|User|>Hello<|Assistant|>'
+
+
+def test_deepseek_v4_chat_template_match_minimal_config(tmp_path):
+    """Match the V4 chat template from a directory that holds only a small ``config.json``."""
+    minimal_config = {
+        'model_type': 'deepseek_v4',
+        'architectures': ['DeepseekV4ForCausalLM'],
+    }
+    (tmp_path / 'config.json').write_text(json.dumps(minimal_config), encoding='utf-8')
+    model_dir = str(tmp_path)
+    assert DeepseekV4ChatTemplate.match(model_dir) == 'deepseek-v4'
+    assert isinstance(get_chat_template(model_dir), DeepseekV4ChatTemplate)
+
+
+def _make_response_parser(thinking=True):
+    """Return a 'default' response parser using the 'deepseek-v4' reasoning and tool parsers."""
+    # NOTE(review): the parser classes are set as attributes on the object the manager returns; if that
+    # object is shared, the assignment outlives this call -- confirm this is acceptable across tests.
+    parser_cls = ResponseParserManager.get('default')
+    parser_cls.reasoning_parser_cls = ReasoningParserManager.get('deepseek-v4')
+    parser_cls.tool_parser_cls = ToolParserManager.get('deepseek-v4')
+    streaming_request = ChatCompletionRequest(
+        model='deepseek-ai/DeepSeek-V4', messages=[], stream=True, chat_template_kwargs={'thinking': thinking},
+    )
+    return parser_cls(request=streaming_request)
+
+
+def test_deepseek_v4_response_parser_complete_dsml_tool_call():
+    """Parse a complete V4 completion that holds a single DSML tool call."""
+    raw_output = ''.join([
+        'I should call a tool.\n\n',
+        '<|DSML|tool_calls>\n',
+        '<|DSML|invoke name="get_weather">\n',
+        '<|DSML|parameter name="city" string="true">Paris|DSML|parameter>\n',
+        '|DSML|invoke>\n',
+        '|DSML|tool_calls>',
+    ])
+
+    response_parser = _make_response_parser(thinking=True)
+    text, calls, reasoning = response_parser.parse_complete(raw_output)
+    assert text is None
+    assert reasoning == 'I should call a tool.'
+    assert calls is not None and len(calls) == 1
+    assert calls[0].function.name == 'get_weather'
+    assert json.loads(calls[0].function.arguments) == {'city': 'Paris'}
+    assert response_parser.validate_complete(raw_output)
+
+
+def test_deepseek_v4_response_parser_streaming_dsml_tool_call():
+    """Feed a whole V4 DSML tool call to ``stream_chunk`` as one chunk."""
+    chunk = ''.join([
+        'need a tool\n\n',
+        '<|DSML|tool_calls>\n',
+        '<|DSML|invoke name="search">\n',
+        '<|DSML|parameter name="query" string="true">DeepSeek V4|DSML|parameter>\n',
+        '|DSML|invoke>\n',
+        '|DSML|tool_calls>',
+    ])
+    emitted = _make_response_parser(thinking=True).stream_chunk(delta_text=chunk, delta_token_ids=[])
+
+    tool_deltas = []
+    for delta, _ in emitted:
+        tool_deltas.extend(delta.tool_calls or [])
+    assert ''.join(delta.reasoning_content or '' for delta, _ in emitted) == 'need a tool'
+    assert tool_deltas[0].function.name == 'search'
+    assert json.loads(tool_deltas[1].function.arguments) == {'query': 'DeepSeek V4'}
+
+
+def test_deepseek_v4_response_parser_reasoning_effort_does_not_enable_thinking():
+    """Stream plain text with only ``reasoning_effort`` set and expect it back as content."""
+    parser_cls = ResponseParserManager.get('default')
+    parser_cls.reasoning_parser_cls = ReasoningParserManager.get('deepseek-v4')
+    parser_cls.tool_parser_cls = None
+    effort_only_request = ChatCompletionRequest(
+        model='deepseek-ai/DeepSeek-V4',
+        messages=[],
+        stream=True,
+        reasoning_effort='max',
+    )
+    emitted = parser_cls(request=effort_only_request).stream_chunk(delta_text='hello', delta_token_ids=[])
+
+    assert len(emitted) == 1
+    delta, tool_emitted = emitted[0]
+    assert tool_emitted is False
+    assert delta.content == 'hello'
+    assert delta.reasoning_content is None