diff --git a/lmdeploy/deepseek_v32_encoding.py b/lmdeploy/deepseek_v32_encoding.py
new file mode 100644
index 0000000000..afeac6e34b
--- /dev/null
+++ b/lmdeploy/deepseek_v32_encoding.py
@@ -0,0 +1,394 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapted from deepseek-ai/DeepSeek-V3.2 encoding/encoding_dsv32.py.
+import copy
+import json
+import re
+from typing import Any
+
+# Tools section of the system prompt. It is filled in with `str.format` using four
+# fields: `dsml_token`, `thinking_start_token`, `thinking_end_token` and `tool_schemas`
+# (one JSON schema per line, placed between the <functions> tags).
+TOOLS_SYSTEM_TEMPLATE = (
+    '## Tools\n\n'
+    "You have access to a set of tools you can use to answer the user's question.\n"
+    'You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of '
+    'your reply to the user:\n'
+    '<{dsml_token}function_calls>\n'
+    '<{dsml_token}invoke name="$FUNCTION_NAME">\n'
+    '<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</{dsml_token}parameter>\n'
+    '...\n'
+    '</{dsml_token}invoke>\n'
+    '<{dsml_token}invoke name="$FUNCTION_NAME2">\n'
+    '...\n'
+    '</{dsml_token}invoke>\n'
+    '</{dsml_token}function_calls>\n\n'
+    'String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects '
+    'should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" '
+    'for other types (numbers, booleans, arrays, objects).\n\n'
+    'If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking '
+    'block. Here is an example:\n\n'
+    '<{dsml_token}function_calls>\n'
+    '...\n'
+    '</{dsml_token}function_calls>\n\n'
+    '<function_results>\n'
+    '...\n'
+    '</function_results>\n\n'
+    '{thinking_start_token}...thinking about results{thinking_end_token}\n\n'
+    'Here are the functions available in JSONSchema format:\n'
+    '<functions>\n'
+    '{tool_schemas}\n'
+    '</functions>\n'
+)
+
+# Sentence delimiters and thinking-block markers used in rendered prompts.
+bos_token: str = '<｜begin▁of▁sentence｜>'
+eos_token: str = '<｜end▁of▁sentence｜>'
+thinking_start_token: str = '<think>'
+thinking_end_token: str = '</think>'
+# Marker embedded in every DSML tag name (function_calls / invoke / parameter).
+dsml_token: str = '｜DSML｜'
+# Per-role templates, filled in with `str.format`.
+system_msg_template: str = '{content}'
+user_msg_template: str = '<｜User｜>{content}<｜Assistant｜>'
+assistant_msg_template: str = '{reasoning}{content}{tool_calls}<｜end▁of▁sentence｜>'
+thinking_template = '{reasoning_content}'
+
+# Section appended to a system/developer message that carries a `response_format`.
+response_format_template: str = (
+    '## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}'
+)
+# A single tool invocation; `arguments` is the already-rendered DSML parameter list.
+tool_call_template: str = (
+    "<{dsml_token}invoke name=\"{name}\">\n{arguments}\n</{dsml_token}invoke>"
+)
+# Wrapper around all invocations emitted by one assistant message.
+tool_calls_template = (
+    '<{dsml_token}function_calls>\n{tool_calls}\n</{dsml_token}function_calls>'
+)
+
+# One tool output; render_message places these inside a <function_results> block.
+tool_output_template: str = (
+    '\n<result>{content}</result>'
+)
+
+def to_json(value: Any) -> str:
+    """Serialize ``value`` to a JSON string, keeping non-ASCII characters as-is.
+
+    If that first attempt raises, serialization is retried with ASCII escaping;
+    an error from the retry propagates to the caller.
+    """
+    try:
+        encoded = json.dumps(value, ensure_ascii=False)
+    except Exception:
+        encoded = json.dumps(value, ensure_ascii=True)
+    return encoded
+
+def tools_from_openai_format(tools):
+    """Return the ``function`` entry of every OpenAI-format tool, in order."""
+    functions = []
+    for entry in tools:
+        functions.append(entry['function'])
+    return functions
+
+def tool_calls_from_openai_format(tool_calls):
+    """Flatten OpenAI-format tool calls into ``{'name', 'arguments'}`` dicts."""
+    flattened = []
+    for call in tool_calls:
+        function = call['function']
+        flattened.append({'name': function['name'], 'arguments': function['arguments']})
+    return flattened
+
+def tool_calls_to_openai_format(tool_calls):
+    """Wrap internal ``{'name', 'arguments'}`` dicts in the OpenAI tool-call shape."""
+    def _wrap(call):
+        function = {'name': call['name'], 'arguments': call['arguments']}
+        return {'type': 'function', 'function': function}
+
+    return [_wrap(call) for call in tool_calls]
+
+def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
+    """Render a tool call's arguments as DSML ``parameter`` tags, one per line.
+
+    ``tool_call['arguments']`` may be a JSON string or an already-decoded dict.
+    String values are emitted verbatim with ``string="true"``; every other
+    value is JSON-encoded and marked ``string="false"``.
+
+    Raises:
+        ValueError: If the arguments do not decode to a JSON object.
+    """
+    payload = tool_call['arguments']
+    if isinstance(payload, str):
+        payload = json.loads(payload)
+    if not isinstance(payload, dict):
+        raise ValueError('Assistant tool call function.arguments must be a JSON object.')
+
+    def _render(name, val):
+        if isinstance(val, str):
+            flag, body = 'true', val
+        else:
+            flag, body = 'false', to_json(val)
+        return f'<{dsml_token}parameter name="{name}" string="{flag}">{body}</{dsml_token}parameter>'
+
+    return '\n'.join(_render(name, val) for name, val in payload.items())
+
+
+def decode_dsml_to_arguments(tool_name: str, tool_args: dict[str, tuple[str, str]]) -> dict[str, str]:
+    """Rebuild a ``{'name', 'arguments'}`` tool call from parsed DSML parameters.
+
+    ``tool_args`` maps each parameter name to ``(value, string_flag)``. Values
+    flagged ``'true'`` are JSON-quoted; all others are inserted into the
+    arguments JSON text exactly as they were parsed.
+    """
+    members = []
+    for param_name, (raw_value, is_str) in tool_args.items():
+        encoded = to_json(raw_value) if is_str == 'true' else raw_value
+        members.append(f'{to_json(param_name)}: {encoded}')
+
+    return {'name': tool_name, 'arguments': '{' + ', '.join(members) + '}'}
+
+def render_tools(tools: list[dict[str, str | dict[str, Any]]]) -> str:
+    """Fill ``TOOLS_SYSTEM_TEMPLATE`` with the given tool schemas, one JSON document per line."""
+    schema_lines = '\n'.join(to_json(schema) for schema in tools)
+    return TOOLS_SYSTEM_TEMPLATE.format(
+        dsml_token=dsml_token,
+        thinking_start_token=thinking_start_token,
+        thinking_end_token=thinking_end_token,
+        tool_schemas=schema_lines,
+    )
+
+def find_last_user_index(messages: list[dict[str, Any]]) -> int:
+    """Return the index of the last ``user``/``developer`` message, or -1 if there is none."""
+    for position in reversed(range(len(messages))):
+        if messages[position].get('role') in ('user', 'developer'):
+            return position
+    return -1
+
+def render_message(index: int, messages: list[dict[str, Any]], thinking_mode: str) -> str:
+    """Render the message at ``index`` into its DeepSeek-V3.2 prompt fragment.
+
+    Args:
+        index: Position in ``messages`` of the message to render.
+        messages: The full conversation; neighbouring messages are consulted to
+            pick the thinking token and to group consecutive tool outputs.
+        thinking_mode: Either ``'chat'`` or ``'thinking'``.
+
+    Returns:
+        The encoded text for this single message.
+
+    Raises:
+        AssertionError: If ``index`` or ``thinking_mode`` is invalid, or the
+            message fails one of the role-specific checks below.
+        NotImplementedError: If the role is not one of system, developer,
+            user, tool or assistant.
+    """
+    assert 0 <= index < len(messages)
+    assert thinking_mode in ['chat', 'thinking'], f'Invalid thinking_mode `{thinking_mode}`'
+
+    prompt = ''
+    msg = messages[index]
+    last_user_idx = find_last_user_index(messages)
+
+    role = msg.get('role')
+    content = msg.get('content')
+    tools = msg.get('tools')
+    response_format = msg.get('response_format')
+    tool_calls = msg.get('tool_calls')
+    reasoning_content = msg.get('reasoning_content')
+
+    # Convert OpenAI-format payloads to the flat internal representation.
+    if tools:
+        tools = tools_from_openai_format(tools)
+    if tool_calls:
+        tool_calls = tool_calls_from_openai_format(tool_calls)
+
+    if role == 'system':
+        prompt += system_msg_template.format(content=content or '')
+        if tools:
+            prompt += '\n\n' + render_tools(tools)
+
+        if response_format:
+            prompt += '\n\n' + response_format_template.format(schema=to_json(response_format))
+
+    elif role == 'developer':
+        # A developer message is rendered as a user turn: tools and response
+        # format first, then the actual content.
+        assert content, f'Invalid message for role `{role}`: {msg}'
+        content_developer = ''
+        if tools:
+            content_developer += '\n\n' + render_tools(tools)
+
+        if response_format:
+            content_developer += '\n\n' + response_format_template.format(schema=to_json(response_format))
+
+        content_developer += f"\n\n# The user's message is: {content}"
+
+        prompt += user_msg_template.format(content=content_developer)
+        # Only the last user-like message opens a thinking block; earlier ones
+        # get the closing token directly.
+        if index == last_user_idx and thinking_mode == 'thinking':
+            prompt += thinking_start_token
+        else:
+            prompt += thinking_end_token
+
+    elif role == 'user':
+        prompt += user_msg_template.format(content=content)
+
+        if index == last_user_idx and thinking_mode == 'thinking':
+            prompt += thinking_start_token
+        else:
+            prompt += thinking_end_token
+
+    elif role == 'tool':
+        # Walk backwards over the run of consecutive tool messages to find the
+        # message that precedes it.
+        # NOTE(review): once the walk reaches -1, ``messages[-1]`` wraps around
+        # to the last message of the conversation.
+        prev_assistant_idx = index - 1
+        assistant_msg = messages[prev_assistant_idx]
+        while prev_assistant_idx >= 0 and assistant_msg.get('role') == 'tool':
+            prev_assistant_idx -= 1
+            assistant_msg = messages[prev_assistant_idx]
+
+        assert (
+            index == 0 or prev_assistant_idx >= 0 and assistant_msg.get('role') == 'assistant'
+        ), f'Invalid messages at {index}:\n{assistant_msg}'
+
+        # 1-based position of this tool output within the run of tool messages.
+        tool_call_order = index - prev_assistant_idx
+        assistant_tool_calls = assistant_msg.get('tool_calls')
+        assert (
+            assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order
+        ), 'No tool calls but found tool output'
+
+        # The first output opens the <function_results> block ...
+        if tool_call_order == 1:
+            prompt += '\n\n<function_results>'
+
+        prompt += tool_output_template.format(content=content)
+
+        # ... and the output matching the last tool call closes it, followed
+        # by the thinking token for the next assistant turn.
+        if tool_call_order == len(assistant_tool_calls):
+            prompt += '\n</function_results>'
+
+            if index >= last_user_idx and thinking_mode == 'thinking':
+                prompt += '\n\n' + thinking_start_token
+            else:
+                prompt += '\n\n' + thinking_end_token
+
+    elif role == 'assistant':
+        prev_assistant_idx = index
+        thinking_part = ''
+
+        tool_calls_content = ''
+        if tool_calls:
+            # Render every call as a DSML invoke block, then wrap them together.
+            tool_calls = [
+                tool_call_template.format(
+                    dsml_token=dsml_token,
+                    name=tool_call.get('name'),
+                    arguments=encode_arguments_to_dsml(tool_call)
+                )
+                for tool_call in tool_calls
+            ]
+            tool_calls_content += '\n\n' + tool_calls_template.format(
+                dsml_token=dsml_token,
+                tool_calls='\n'.join(tool_calls)
+            )
+
+        summary_content = content or ''
+
+        # Reasoning is only rendered for assistant turns after the last user
+        # message, and only in thinking mode.
+        if thinking_mode == 'thinking' and index > last_user_idx:
+            assert reasoning_content or tool_calls, (
+                f'ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` '
+                'after last user message')
+            thinking_part = thinking_template.format(reasoning_content=reasoning_content or '') + thinking_end_token
+
+        prompt += assistant_msg_template.format(
+            reasoning=thinking_part,
+            content=summary_content,
+            tool_calls=tool_calls_content,
+        )
+    else:
+        raise NotImplementedError(f'Unknown role: {role}')
+
+    return prompt
+
+def drop_thinking_messages(messages: list[dict[str, Any]], last_user_idx: int | None = None) -> list[dict[str, Any]]:
+    """Strip ``reasoning_content`` from assistant messages before the last user turn.
+
+    The result always has the same length and order as ``messages``, so indices
+    computed against the input stay valid for the output. Messages of any other
+    role (including ``developer``) and every message at or after
+    ``last_user_idx`` are passed through unchanged.
+
+    Args:
+        messages: Conversation to process; it is not modified.
+        last_user_idx: Index of the last user-like message. When ``None`` it is
+            looked up with ``find_last_user_index``.
+
+    Returns:
+        A new list. Stripped assistant messages are shallow copies; all other
+        entries are the original message objects.
+    """
+    if last_user_idx is None:
+        last_user_idx = find_last_user_index(messages)
+
+    messages_wo_thinking: list[dict[str, Any]] = []
+    for idx, msg in enumerate(messages):
+        if idx < last_user_idx and msg.get('role') == 'assistant':
+            # Copy before popping so the caller's message keeps its reasoning.
+            msg = copy.copy(msg)
+            msg.pop('reasoning_content', None)
+        # Every message is kept: dropping one would shift the indices that
+        # callers use to address the returned list.
+        messages_wo_thinking.append(msg)
+
+    return messages_wo_thinking
+
+def encode_messages(messages: list[dict[str, Any]],
+                    thinking_mode: str,
+                    context: list[dict[str, Any]] | None = None,
+                    drop_thinking: bool = True,
+                    add_default_bos_token: bool = True) -> str:
+    """Encode ``messages`` into a prompt string.
+
+    ``context`` holds earlier messages that are consulted while rendering but
+    are not rendered themselves. The BOS token is only prepended when
+    requested and when no context is given. In thinking mode with
+    ``drop_thinking`` set, the conversation is first passed through
+    ``drop_thinking_messages``.
+    """
+    history = context or []
+    conversation = history + messages
+
+    pieces = [bos_token] if add_default_bos_token and not history else []
+
+    if drop_thinking and thinking_mode == 'thinking':
+        conversation = drop_thinking_messages(conversation)
+
+    offset = len(history)
+    for position in range(offset, offset + len(messages)):
+        pieces.append(render_message(position, conversation, thinking_mode=thinking_mode))
+
+    return ''.join(pieces)
+
+def _read_until_stop(index: int, text: str, stop: list[str]) -> tuple[int, str, str | None]:
+    """Read ``text`` from ``index`` up to the earliest occurrence of any stop string.
+
+    Returns:
+        ``(next_index, content, matched_stop)``. ``next_index`` points just
+        past the matched stop string. When nothing matches, the rest of the
+        text is returned as ``content`` with ``matched_stop`` set to ``None``.
+    """
+    earliest, winner = len(text), None
+    for candidate in stop:
+        found = text.find(candidate, index)
+        # On equal positions the candidate listed first keeps the match.
+        if -1 < found < earliest:
+            earliest, winner = found, candidate
+
+    if not winner:
+        return len(text), text[index:], None
+    return earliest + len(winner), text[index:earliest], winner
+
+def parse_tool_calls(index: int, text: str):
+    """Parse DSML ``invoke`` blocks from ``text`` starting at ``index``.
+
+    Parsing stops at the closing ``function_calls`` tag or at the end of ``text``.
+
+    Args:
+        index: Offset in ``text`` at which to start reading.
+        text: Completion text being parsed.
+
+    Returns:
+        A tuple ``(index, stop_token, tool_calls)``: the offset reached, the last
+        stop token matched (or ``None``), and one ``{'name', 'arguments'}`` dict
+        per parsed invoke block.
+
+    Raises:
+        AssertionError: If the text deviates from the expected DSML layout.
+    """
+    tool_calls: list[dict[str, Any]] = []
+    stop_token = None
+    tool_calls_end_token = f'</{dsml_token}function_calls>'
+
+    while index < len(text):
+        # Skip to the next invoke block or to the end of the function_calls
+        # block; the only text allowed in between is the '>\n' that closes
+        # the previous tag.
+        index, _, stop_token = _read_until_stop(index, text, [f'<{dsml_token}invoke', tool_calls_end_token])
+        assert _ == '>\n', 'Tool call format error'
+
+        if stop_token == tool_calls_end_token:
+            break
+
+        assert stop_token is not None, 'Missing special token'
+
+        # Everything up to the first parameter (or the end of the invoke
+        # block) holds the name attribute.
+        index, tool_name_content, stop_token = _read_until_stop(
+            index, text, [f'<{dsml_token}parameter', f'</{dsml_token}invoke'])
+
+        p_tool_name = re.findall(r'^\s*name="(.*?)">\n$', tool_name_content, flags=re.DOTALL)
+        assert len(p_tool_name) == 1, 'Tool name format error'
+        tool_name = p_tool_name[0]
+
+        tool_args: dict[str, tuple[str, str]] = {}
+        while stop_token == f'<{dsml_token}parameter':
+            # The stop string omits the leading '<' of the closing tag, so
+            # param_content ends with '<', which the regex below expects.
+            index, param_content, stop_token = _read_until_stop(index, text, [f'/{dsml_token}parameter'])
+
+            param_kv = re.findall(r'^ name="(.*?)" string="(true|false)">(.*?)<$', param_content, flags=re.DOTALL)
+            assert len(param_kv) == 1, 'Parameter format error'
+            param_name, string, param_value = param_kv[0]
+
+            assert param_name not in tool_args, 'Duplicate parameter name'
+            tool_args[param_name] = (param_value, string)
+
+            # Only '>\n' may separate a closed parameter from the next tag.
+            index, content, stop_token = _read_until_stop(
+                index, text, [f'<{dsml_token}parameter', f'</{dsml_token}invoke'])
+            assert content == '>\n', 'Parameter format error'
+
+        tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args)
+        tool_calls.append(tool_call)
+
+    return index, stop_token, tool_calls
+
+# NOTE: This function parses only correctly formatted strings and will not attempt to correct
+# malformed output that may be generated by the model.
+def parse_message_from_completion_text(text: str, thinking_mode: str):
+    """Parse a raw completion into an OpenAI-style assistant message.
+
+    Args:
+        text: Completion text. Without tool calls it must end with the EOS
+            token; after a tool-call block the EOS token may be absent.
+        thinking_mode: When ``'thinking'``, the text must contain the thinking
+            end token before any tool-call block; everything before that token
+            becomes ``reasoning_content``.
+
+    Returns:
+        A dict with ``role``, ``content``, ``reasoning_content`` and
+        ``tool_calls`` (in OpenAI format).
+
+    Raises:
+        AssertionError: If the text does not follow the expected layout or if
+            a special token appears inside the content or reasoning.
+    """
+    summary_content, reasoning_content, tool_calls = '', '', []
+    index, stop_token = 0, None
+    tool_calls_start_token = f'\n\n<{dsml_token}function_calls'
+
+    is_thinking, is_tool_calling = thinking_mode == 'thinking', False
+
+    if is_thinking:
+        # The reasoning must be closed before a tool-call block starts.
+        index, content_delta, stop_token = _read_until_stop(index, text, [thinking_end_token, tool_calls_start_token])
+        reasoning_content = content_delta
+        assert stop_token == thinking_end_token, 'Invalid thinking format'
+
+    # The visible answer runs up to the EOS token or the start of the tool calls.
+    index, content_delta, stop_token = _read_until_stop(index, text, [eos_token, tool_calls_start_token])
+    summary_content = content_delta
+    if stop_token == tool_calls_start_token:
+        is_tool_calling = True
+    else:
+        assert stop_token == eos_token, 'Invalid summary format'
+
+    if is_tool_calling:
+        index, stop_token, tool_calls = parse_tool_calls(index, text)
+
+        # Nothing but an optional EOS token may follow the tool-call block.
+        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
+        assert not tool_ends_text, 'Unexpected content after tool calls'
+
+    assert len(text) == index and stop_token in [eos_token, None], 'Unexpected content at end'
+
+    # Special tokens must not appear inside the extracted content or reasoning.
+    for sp_token in [bos_token, eos_token, thinking_start_token, thinking_end_token, dsml_token]:
+        assert (
+            sp_token not in summary_content and sp_token not in reasoning_content
+        ), 'Unexpected special token in content'
+
+    return {
+        'role': 'assistant',
+        'content': summary_content,
+        'reasoning_content': reasoning_content,
+        'tool_calls': tool_calls_to_openai_format(tool_calls)
+    }
diff --git a/lmdeploy/deepseek_v4_encoding.py b/lmdeploy/deepseek_v4_encoding.py
new file mode 100644
index 0000000000..c24c4c5584
--- /dev/null
+++ b/lmdeploy/deepseek_v4_encoding.py
@@ -0,0 +1,743 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapted from deepseek-ai/DeepSeek-V4-Pro encoding/encoding_dsv4.py.
+"""DeepSeek-V4 Encoding.
+
+A self-contained implementation for encoding/decoding DeepSeek-V4 chat messages with tool calling, thinking mode, and
+quick instruction task support.
+"""
+
+import copy
+import json
+import re
+from typing import Any
+
+# ============================================================
+# Special Tokens
+# ============================================================
+
+# Sentence delimiters and thinking-block markers used in rendered prompts.
+bos_token: str = '<｜begin▁of▁sentence｜>'
+eos_token: str = '<｜end▁of▁sentence｜>'
+thinking_start_token: str = '<think>'
+thinking_end_token: str = '</think>'
+# Marker embedded in every DSML tag name (tool_calls / invoke / parameter).
+dsml_token: str = '｜DSML｜'
+
+# Role markers that render_message writes in front of a turn.
+USER_SP_TOKEN = '<｜User｜>'
+ASSISTANT_SP_TOKEN = '<｜Assistant｜>'
+LATEST_REMINDER_SP_TOKEN = '<｜latest_reminder｜>'
+
+# Task special tokens for internal classification tasks
+# (keyed by the value of a message's `task` field).
+DS_TASK_SP_TOKENS = {
+    'action': '<｜action｜>',
+    'query': '<｜query｜>',
+    'authority': '<｜authority｜>',
+    'domain': '<｜domain｜>',
+    'title': '<｜title｜>',
+    'read_url': '<｜read_url｜>',
+}
+# Task names accepted by render_message.
+VALID_TASKS = set(DS_TASK_SP_TOKENS.keys())
+
+# ============================================================
+# Templates
+# ============================================================
+
+# Per-role templates, filled in with `str.format`. Unlike the assistant
+# templates, the user template carries no role marker; render_message adds
+# USER_SP_TOKEN itself.
+system_msg_template: str = '{content}'
+user_msg_template: str = '{content}'
+latest_reminder_msg_template: str = '{content}'
+assistant_msg_template: str = '{reasoning}{content}{tool_calls}' + eos_token
+# Variant used for assistant messages flagged with `wo_eos`.
+assistant_msg_wo_eos_template: str = '{reasoning}{content}{tool_calls}'
+thinking_template: str = '{reasoning_content}'
+
+# Section appended to a system/developer message that carries a `response_format`.
+response_format_template: str = (
+    '## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}'
+)
+# A single tool invocation; `arguments` is the already-rendered DSML parameter list.
+tool_call_template: str = (
+    "<{dsml_token}invoke name=\"{name}\">\n{arguments}\n</{dsml_token}invoke>"
+)
+# Wrapper around all invocations of one assistant message; the tag name is
+# supplied through `tc_block_name`.
+tool_calls_template = (
+    '<{dsml_token}{tc_block_name}>\n{tool_calls}\n</{dsml_token}{tc_block_name}>'
+)
+tool_calls_block_name: str = 'tool_calls'
+
+# One tool result as embedded in a user turn.
+tool_output_template: str = (
+    '<tool_result>{content}</tool_result>'
+)
+
+# Prefix that render_message prepends to the message at index 0 when
+# thinking mode is combined with reasoning_effort == 'max'.
+REASONING_EFFORT_MAX = (
+    'Reasoning Effort: Absolute maximum with no shortcuts permitted.\n'
+    'You MUST be very thorough in your thinking and comprehensively decompose the problem to resolve the '
+    'root cause, rigorously stress-testing your logic against all potential paths, edge cases, and adversarial '
+    'scenarios.\n'
+    'Explicitly write out your entire deliberation process, documenting every intermediate step, considered '
+    'alternative, and rejected hypothesis to ensure absolutely no assumption is left unchecked.\n\n'
+)
+
+# Tools section filled in by render_tools with `str.format`; the fields are
+# `dsml_token`, `thinking_start_token`, `thinking_end_token` and `tool_schemas`.
+TOOLS_TEMPLATE = (
+    '## Tools\n\n'
+    "You have access to a set of tools to help answer the user's question. You can invoke tools by writing a "
+    '"<{dsml_token}tool_calls>" block like the following:\n\n'
+    '<{dsml_token}tool_calls>\n'
+    '<{dsml_token}invoke name="$TOOL_NAME">\n'
+    '<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</{dsml_token}parameter>\n'
+    '...\n'
+    '</{dsml_token}invoke>\n'
+    '<{dsml_token}invoke name="$TOOL_NAME2">\n'
+    '...\n'
+    '</{dsml_token}invoke>\n'
+    '</{dsml_token}tool_calls>\n\n'
+    'String parameters should be specified as is and set `string="true"`. For all other types (numbers, booleans, '
+    'arrays, objects), pass the value in JSON format and set `string="false"`.\n\n'
+    'If thinking_mode is enabled (triggered by {thinking_start_token}), you MUST output your complete reasoning '
+    'inside {thinking_start_token}...{thinking_end_token} BEFORE any tool calls or final response.\n\n'
+    'Otherwise, output directly after {thinking_end_token} with tool calls or final response.\n\n'
+    '### Available Tool Schemas\n\n'
+    '{tool_schemas}\n\n'
+    'You MUST strictly follow the above defined tool name and parameter schemas to invoke tool calls.\n'
+)
+
+# ============================================================
+# Utility Functions
+# ============================================================
+
+def to_json(value: Any) -> str:
+    """Return ``value`` as a JSON string, leaving non-ASCII characters unescaped.
+
+    Should that attempt raise, the value is serialized again with ASCII
+    escaping; an error from the second attempt propagates.
+    """
+    try:
+        dumped = json.dumps(value, ensure_ascii=False)
+    except Exception:
+        dumped = json.dumps(value, ensure_ascii=True)
+    return dumped
+
+
+def tools_from_openai_format(tools):
+    """Collect the ``function`` definition of each OpenAI-format tool entry."""
+    definitions = []
+    for tool in tools:
+        definitions.append(tool['function'])
+    return definitions
+
+
+def tool_calls_from_openai_format(tool_calls):
+    """Reduce OpenAI-format tool calls to internal ``{'name', 'arguments'}`` dicts."""
+    internal = []
+    for tool_call in tool_calls:
+        fn = tool_call['function']
+        internal.append({'name': fn['name'], 'arguments': fn['arguments']})
+    return internal
+
+
+def tool_calls_to_openai_format(tool_calls):
+    """Wrap internal tool calls in the OpenAI ``{'type', 'function'}`` envelope."""
+    wrapped = []
+    for tool_call in tool_calls:
+        function = {'name': tool_call['name'], 'arguments': tool_call['arguments']}
+        wrapped.append({'type': 'function', 'function': function})
+    return wrapped
+
+
+def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
+    """Encode tool call arguments into DSML parameter format.
+
+    Args:
+        tool_call: Dict with "name" and "arguments" keys. "arguments" is
+            normally a JSON-object string; an already-decoded dict is
+            accepted as well.
+
+    Returns:
+        DSML-formatted parameter string, one parameter per line. If
+        "arguments" cannot be decoded into a JSON object, the raw value is
+        emitted as a single parameter named "arguments".
+    """
+    p_dsml_template = '<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>'
+
+    raw_arguments = tool_call['arguments']
+    if isinstance(raw_arguments, dict):
+        # Already decoded: expand its keys instead of wrapping the whole dict.
+        arguments = raw_arguments
+    else:
+        try:
+            arguments = json.loads(raw_arguments)
+        except (TypeError, ValueError, RecursionError):
+            arguments = None
+        # Undecodable input and valid-but-non-object JSON (list, number, ...)
+        # both fall back to a single raw "arguments" parameter, so the
+        # `.items()` call below always has a dict to work on.
+        if not isinstance(arguments, dict):
+            arguments = {'arguments': raw_arguments}
+
+    return '\n'.join(
+        p_dsml_template.format(
+            dsml_token=dsml_token,
+            key=k,
+            is_str='true' if isinstance(v, str) else 'false',
+            value=v if isinstance(v, str) else to_json(v),
+        )
+        for k, v in arguments.items()
+    )
+
+
+def decode_dsml_to_arguments(tool_name: str, tool_args: dict[str, tuple[str, str]]) -> dict[str, str]:
+    """Turn parsed DSML parameters back into a tool call dict.
+
+    Args:
+        tool_name: Name of the tool.
+        tool_args: Mapping of parameter name to ``(value, is_string_flag)``.
+
+    Returns:
+        Dict with "name" and "arguments" keys. "arguments" is JSON text in
+        which values flagged ``'true'`` are JSON-quoted and all other values
+        are inserted exactly as parsed.
+    """
+    members = []
+    for key, (value, is_str) in tool_args.items():
+        rendered = to_json(value) if is_str == 'true' else value
+        members.append(f'{to_json(key)}: {rendered}')
+
+    return {'name': tool_name, 'arguments': '{' + ', '.join(members) + '}'}
+
+
+def render_tools(tools: list[dict[str, str | dict[str, Any]]]) -> str:
+    """Build the tools section of the prompt from a list of tool schemas.
+
+    Args:
+        tools: Tool schema dicts; each is serialized to one line of JSON.
+
+    Returns:
+        ``TOOLS_TEMPLATE`` with the schemas and special tokens filled in.
+    """
+    schema_block = '\n'.join(to_json(schema) for schema in tools)
+    return TOOLS_TEMPLATE.format(
+        dsml_token=dsml_token,
+        thinking_start_token=thinking_start_token,
+        thinking_end_token=thinking_end_token,
+        tool_schemas=schema_block,
+    )
+
+
+def find_last_user_index(messages: list[dict[str, Any]]) -> int:
+    """Return the position of the last user/developer message, or -1 if none exists."""
+    hits = (
+        pos for pos in range(len(messages) - 1, -1, -1)
+        if messages[pos].get('role') in ('user', 'developer')
+    )
+    return next(hits, -1)
+
+
+# ============================================================
+# Message Rendering
+# ============================================================
+
+def render_message(index: int,
+                   messages: list[dict[str, Any]],
+                   thinking_mode: str,
+                   drop_thinking: bool = True,
+                   reasoning_effort: str | None = None) -> str:
+    """Render a single message at the given index into its encoded string form.
+
+    This is the core function that converts each message in the conversation
+    into the DeepSeek-V4 format.
+
+    Args:
+        index: Index of the message to render.
+        messages: Full list of messages in the conversation.
+        thinking_mode: Either "chat" or "thinking".
+        drop_thinking: Whether to drop reasoning content from earlier turns.
+        reasoning_effort: Optional reasoning effort level ("max", "high", or None).
+            Only "max" changes the output of this function.
+
+    Returns:
+        Encoded string for this message.
+
+    Raises:
+        AssertionError: If ``index``, ``thinking_mode``, ``reasoning_effort`` or
+            a message's ``task`` is invalid, or a developer message has no content.
+        NotImplementedError: For ``tool`` messages (they must be merged into
+            user messages first) and for unknown roles.
+    """
+    assert 0 <= index < len(messages)
+    assert thinking_mode in ['chat', 'thinking'], f'Invalid thinking_mode `{thinking_mode}`'
+
+    prompt = ''
+    msg = messages[index]
+    last_user_idx = find_last_user_index(messages)
+
+    role = msg.get('role')
+    content = msg.get('content')
+    tools = msg.get('tools')
+    response_format = msg.get('response_format')
+    tool_calls = msg.get('tool_calls')
+    reasoning_content = msg.get('reasoning_content')
+    # When truthy, an assistant message is rendered without the trailing EOS token.
+    wo_eos = msg.get('wo_eos', False)
+
+    # Convert OpenAI-format payloads to the flat internal representation.
+    if tools:
+        tools = tools_from_openai_format(tools)
+    if tool_calls:
+        tool_calls = tool_calls_from_openai_format(tool_calls)
+
+    # Reasoning effort prefix (only at index 0 in thinking mode with max effort)
+    assert reasoning_effort in ['max', None, 'high'], f'Invalid reasoning effort: {reasoning_effort}'
+    if index == 0 and thinking_mode == 'thinking' and reasoning_effort == 'max':
+        prompt += REASONING_EFFORT_MAX
+
+    if role == 'system':
+        prompt += system_msg_template.format(content=content or '')
+        if tools:
+            prompt += '\n\n' + render_tools(tools)
+        if response_format:
+            prompt += '\n\n' + response_format_template.format(schema=to_json(response_format))
+
+    elif role == 'developer':
+        # A developer message is rendered as a user turn: content first, then
+        # the optional tools and response-format sections.
+        assert content, f'Invalid message for role `{role}`: {msg}'
+
+        content_developer = USER_SP_TOKEN
+        content_developer += content
+
+        if tools:
+            content_developer += '\n\n' + render_tools(tools)
+        if response_format:
+            content_developer += '\n\n' + response_format_template.format(schema=to_json(response_format))
+
+        prompt += user_msg_template.format(content=content_developer)
+
+    elif role == 'user':
+        prompt += USER_SP_TOKEN
+
+        # Handle content blocks (tool results mixed with text)
+        content_blocks = msg.get('content_blocks')
+        if content_blocks:
+            parts = []
+            for block in content_blocks:
+                block_type = block.get('type')
+                if block_type == 'text':
+                    parts.append(block.get('text', ''))
+                elif block_type == 'tool_result':
+                    tool_content = block.get('content', '')
+                    # A list-valued tool result is flattened to text; blocks
+                    # that are not text are replaced by a placeholder.
+                    if isinstance(tool_content, list):
+                        text_parts = []
+                        for b in tool_content:
+                            if b.get('type') == 'text':
+                                text_parts.append(b.get('text', ''))
+                            else:
+                                text_parts.append(f"[Unsupported {b.get('type')}]")
+                        tool_content = '\n\n'.join(text_parts)
+                    parts.append(tool_output_template.format(content=tool_content))
+                else:
+                    parts.append(f'[Unsupported {block_type}]')
+            prompt += '\n\n'.join(parts)
+        else:
+            prompt += content or ''
+
+    elif role == 'latest_reminder':
+        prompt += LATEST_REMINDER_SP_TOKEN + latest_reminder_msg_template.format(content=content)
+
+    elif role == 'tool':
+        raise NotImplementedError(
+            'deepseek_v4 merges tool messages into user; please preprocess with merge_tool_messages()')
+
+    elif role == 'assistant':
+        thinking_part = ''
+        tc_content = ''
+
+        if tool_calls:
+            # Render every call as a DSML invoke block, then wrap them together.
+            tc_list = [
+                tool_call_template.format(
+                    dsml_token=dsml_token,
+                    name=tc.get('name'),
+                    arguments=encode_arguments_to_dsml(tc)
+                )
+                for tc in tool_calls
+            ]
+            tc_content += '\n\n' + tool_calls_template.format(
+                dsml_token=dsml_token,
+                tool_calls='\n'.join(tc_list),
+                tc_block_name=tool_calls_block_name,
+            )
+
+        summary_content = content or ''
+        rc = reasoning_content or ''
+
+        # Check if previous message has a task - if so, this is a task output (no thinking)
+        prev_has_task = index - 1 >= 0 and messages[index - 1].get('task') is not None
+
+        if thinking_mode == 'thinking' and not prev_has_task:
+            # With drop_thinking, only turns after the last user message keep
+            # their reasoning block.
+            if not drop_thinking or index > last_user_idx:
+                thinking_part = thinking_template.format(reasoning_content=rc) + thinking_end_token
+            else:
+                thinking_part = ''
+
+        if wo_eos:
+            prompt += assistant_msg_wo_eos_template.format(
+                reasoning=thinking_part,
+                content=summary_content,
+                tool_calls=tc_content,
+            )
+        else:
+            prompt += assistant_msg_template.format(
+                reasoning=thinking_part,
+                content=summary_content,
+                tool_calls=tc_content,
+            )
+    else:
+        raise NotImplementedError(f'Unknown role: {role}')
+
+    # Append transition tokens based on what follows
+    # (nothing is appended when the next message is neither an assistant nor
+    # a latest_reminder message).
+    if index + 1 < len(messages) and messages[index + 1].get('role') not in ['assistant', 'latest_reminder']:
+        return prompt
+
+    task = messages[index].get('task')
+    if task is not None:
+        # Task special token for internal classification tasks
+        assert task in VALID_TASKS, f"Invalid task: '{task}'. Valid tasks are: {list(VALID_TASKS)}"
+        task_sp_token = DS_TASK_SP_TOKENS[task]
+
+        if task != 'action':
+            # Non-action tasks: append task sp token directly after the message
+            prompt += task_sp_token
+        else:
+            # Action task: append Assistant + thinking token + action sp token
+            prompt += ASSISTANT_SP_TOKEN
+            prompt += thinking_end_token if thinking_mode != 'thinking' else thinking_start_token
+            prompt += task_sp_token
+
+    elif messages[index].get('role') in ['user', 'developer']:
+        # Normal generation: append Assistant + thinking token
+        prompt += ASSISTANT_SP_TOKEN
+        # A thinking block is opened in thinking mode, except that with
+        # drop_thinking it is opened only at or after the last user message.
+        if not drop_thinking and thinking_mode == 'thinking':
+            prompt += thinking_start_token
+        elif drop_thinking and thinking_mode == 'thinking' and index >= last_user_idx:
+            prompt += thinking_start_token
+        else:
+            prompt += thinking_end_token
+
+    return prompt
+
+
+# ============================================================
+# Preprocessing
+# ============================================================
+
+def merge_tool_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Fold ``tool`` messages into user messages as ``content_blocks``.
+
+    Each ``tool`` message becomes a ``tool_result`` block and each ``user``
+    message a ``text`` block. A block is appended to the previous output
+    message when that message is a user message that already carries
+    ``content_blocks``; a text block is additionally only appended when that
+    message has no ``task``. Otherwise a new user message is started.
+    Messages of any other role are deep-copied through unchanged.
+
+    Args:
+        messages: List of message dicts in OpenAI format; it is not modified.
+
+    Returns:
+        New message list without standalone ``tool`` messages.
+    """
+    result: list[dict[str, Any]] = []
+
+    for original in messages:
+        current = copy.deepcopy(original)
+        kind = current.get('role')
+
+        if kind not in ('tool', 'user'):
+            result.append(current)
+            continue
+
+        tail = result[-1] if result else None
+        tail_takes_blocks = tail is not None and tail.get('role') == 'user' and 'content_blocks' in tail
+
+        if kind == 'tool':
+            block = {
+                'type': 'tool_result',
+                'tool_use_id': current.get('tool_call_id', ''),
+                'content': current.get('content', ''),
+            }
+            if tail_takes_blocks:
+                tail['content_blocks'].append(block)
+            else:
+                result.append({'role': 'user', 'content_blocks': [block]})
+            continue
+
+        # kind == 'user'
+        block = {'type': 'text', 'text': current.get('content', '')}
+        if tail_takes_blocks and tail.get('task') is None:
+            tail['content_blocks'].append(block)
+            continue
+
+        fresh = {
+            'role': 'user',
+            'content': current.get('content', ''),
+            'content_blocks': [block],
+        }
+        # Only these three extra fields are carried over to the new message.
+        fresh.update({key: current[key] for key in ('task', 'wo_eos', 'mask') if key in current})
+        result.append(fresh)
+
+    return result
+
+
+def sort_tool_results_by_call_order(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Reorder ``tool_result`` blocks to follow the preceding tool_calls order.
+
+    Walks the conversation once, remembering the position of every call id of
+    the most recent assistant message with ``tool_calls``. A later user
+    message holding more than one ``tool_result`` block gets those blocks
+    stably sorted by that position (unknown ids rank as 0) and written back
+    into the slots tool results occupied. Message dicts are updated in place.
+
+    Args:
+        messages: Preprocessed message list (after merge_tool_messages).
+
+    Returns:
+        The same list object, with tool result blocks sorted.
+    """
+    call_rank: dict[str, int] = {}
+
+    for message in messages:
+        role = message.get('role')
+        if role == 'assistant' and message.get('tool_calls'):
+            call_rank = {}
+            for position, call in enumerate(message['tool_calls']):
+                call_id = call.get('id') or call.get('function', {}).get('id', '')
+                if call_id:
+                    call_rank[call_id] = position
+            continue
+
+        if role != 'user' or not message.get('content_blocks'):
+            continue
+
+        blocks = message['content_blocks']
+        results = [blk for blk in blocks if blk.get('type') == 'tool_result']
+        if len(results) <= 1 or not call_rank:
+            continue
+        # Stable sort, then deal the results back out in document order.
+        ordered = iter(sorted(results, key=lambda blk: call_rank.get(blk.get('tool_use_id', ''), 0)))
+        message['content_blocks'] = [next(ordered) if blk.get('type') == 'tool_result' else blk for blk in blocks]
+
+    return messages
+
+
+# ============================================================
+# Main Encoding Function
+# ============================================================
+
+def encode_messages(
+    messages: list[dict[str, Any]],
+    thinking_mode: str,
+    context: list[dict[str, Any]] | None = None,
+    drop_thinking: bool = True,
+    add_default_bos_token: bool = True,
+    reasoning_effort: str | None = None,
+) -> str:
+    """Encode a list of messages into the DeepSeek-V4 prompt format.
+
+    This is the main entry point for encoding conversations. It handles:
+    - BOS token insertion
+    - Thinking mode with optional reasoning content dropping
+    - Tool message merging into user messages
+    - Multi-turn conversation context
+
+    Args:
+        messages: List of message dicts to encode.
+        thinking_mode: Either "chat" or "thinking".
+        context: Optional preceding context messages (already encoded prefix).
+        drop_thinking: If True, drop reasoning_content from earlier assistant turns
+                      (only keep reasoning for messages after the last user message).
+                      Has no effect when any message defines ``tools``.
+        add_default_bos_token: Whether to prepend BOS token at conversation start.
+        reasoning_effort: Optional reasoning effort level ("max", "high", or None).
+
+    Returns:
+        The encoded prompt string.
+    """
+    context = context if context else []
+
+    # Preprocess: merge tool messages and sort tool results
+    messages = merge_tool_messages(messages)
+    messages = sort_tool_results_by_call_order(context + messages)[len(context):]
+    if context:
+        context = merge_tool_messages(context)
+        context = sort_tool_results_by_call_order(context)
+
+    full_messages = context + messages
+    context_len = len(context)
+
+    prompt = bos_token if add_default_bos_token and not context else ''
+
+    # Resolve drop_thinking: if any message has tools defined, don't drop thinking
+    effective_drop_thinking = drop_thinking
+    if any(m.get('tools') for m in full_messages):
+        effective_drop_thinking = False
+
+    if thinking_mode == 'thinking' and effective_drop_thinking:
+        # Dropping can remove context messages too. Count the surviving context
+        # with the cut-off that applies to the whole conversation: judging the
+        # context by its own last user turn over-counts it and would skip the first
+        # new message(s). Assumes find_last_user_index judges each message on its own.
+        last_user_idx = find_last_user_index(full_messages)
+        if last_user_idx >= len(context):
+            anchored = context + [full_messages[last_user_idx]]
+            context_len = len(_drop_thinking_messages(anchored)) - 1
+        else:
+            context_len = len(_drop_thinking_messages(context))
+        full_messages = _drop_thinking_messages(full_messages)
+
+    for idx in range(context_len, len(full_messages)):
+        prompt += render_message(idx, full_messages, thinking_mode=thinking_mode,
+                                 drop_thinking=effective_drop_thinking, reasoning_effort=reasoning_effort)
+
+    return prompt
+
+
+def _drop_thinking_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Prune the turns that precede the last user message.
+
+    Rules, with ``cutoff = find_last_user_index(messages)``:
+    - "user", "system", "tool", "latest_reminder", "direct_search_results": kept;
+    - every message at or after ``cutoff`` is kept;
+    - "assistant" before ``cutoff``: kept as a copy minus "reasoning_content";
+    - any other message before ``cutoff`` (e.g. "developer") is dropped.
+
+    Returns a new list; the input list and its dicts are left unmodified.
+    """
+    always_kept = {'user', 'system', 'tool', 'latest_reminder', 'direct_search_results'}
+    cutoff = find_last_user_index(messages)
+    kept = []
+
+    for position, message in enumerate(messages):
+        role = message.get('role')
+        if role not in always_kept and position < cutoff:
+            if role != 'assistant':
+                continue  # developer and other roles before the cutoff are dropped
+            message = copy.copy(message)
+            message.pop('reasoning_content', None)
+        kept.append(message)
+
+    return kept
+
+
+# ============================================================
+# Parsing (Decoding model output)
+# ============================================================
+
+def _read_until_stop(index: int, text: str, stop: list[str]) -> tuple[int, str, str | None]:
+    """Scan ``text`` from ``index`` up to the earliest occurrence of a stop string.
+
+    If several stop strings start at the same position, the one listed first wins.
+
+    Returns:
+        Tuple of (new_index, content_before_stop, matched_stop_string_or_None).
+        ``new_index`` points just past the matched stop string; without a
+        match, the rest of the text is the content and new_index is len(text).
+    """
+    best_pos, best_stop = len(text), None
+    for candidate in stop:
+        found = text.find(candidate, index)
+        if -1 < found < best_pos:
+            best_pos, best_stop = found, candidate
+
+    if not best_stop:
+        # Nothing to stop at: everything that is left is content.
+        return len(text), text[index:], None
+
+    return best_pos + len(best_stop), text[index:best_pos], best_stop
+
+
+def parse_tool_calls(index: int, text: str) -> tuple[int, str | None, list[dict[str, str]]]:
+    """Parse DSML tool calls from text starting at the given index.
+
+    Args:
+        index: Starting position in text; ">" and a newline are expected there.
+        text: The full text to parse.
+    Returns:
+        Tuple of (new_index, last_stop_token, tool_calls); each call dict has "name" and "arguments" keys.
+    Raises:
+        ValueError: If the text deviates from the expected DSML layout.
+    """
+    tool_calls: list[dict[str, Any]] = []
+    stop_token = None
+    tool_calls_end_token = f'</{dsml_token}{tool_calls_block_name}>'
+
+    while index < len(text):  # one iteration per invoke element (or the block end)
+        index, _, stop_token = _read_until_stop(index, text, [f'<{dsml_token}invoke', tool_calls_end_token])
+        if _ != '>\n':  # `_` is the text skipped before the stop token: only a tag tail may be there
+            raise ValueError(f"Tool call format error: expected '>\\n' but got '{_}'")
+
+        if stop_token == tool_calls_end_token:  # end of the whole tool-calls block
+            break
+
+        if stop_token is None:
+            raise ValueError('Missing special token in tool calls')
+
+        index, tool_name_content, stop_token = _read_until_stop(
+            index, text, [f'<{dsml_token}parameter', f'</{dsml_token}invoke'])
+
+        p_tool_name = re.findall(r'^\s*name="(.*?)">\n$', tool_name_content, flags=re.DOTALL)
+        if len(p_tool_name) != 1:
+            raise ValueError(f"Tool name format error: '{tool_name_content}'")
+        tool_name = p_tool_name[0]
+
+        tool_args: dict[str, tuple[str, str]] = {}  # parameter name -> (raw value, string flag)
+        while stop_token == f'<{dsml_token}parameter':
+            index, param_content, stop_token = _read_until_stop(index, text, [f'/{dsml_token}parameter'])
+
+            param_kv = re.findall(r'^ name="(.*?)" string="(true|false)">(.*?)<$', param_content, flags=re.DOTALL)
+            if len(param_kv) != 1:
+                raise ValueError(f"Parameter format error: '{param_content}'")
+            param_name, string, param_value = param_kv[0]  # string is the literal "true" or "false"
+
+            if param_name in tool_args:
+                raise ValueError(f"Duplicate parameter name: '{param_name}'")
+            tool_args[param_name] = (param_value, string)
+
+            index, content, stop_token = _read_until_stop(
+                index, text, [f'<{dsml_token}parameter', f'</{dsml_token}invoke'])
+            if content != '>\n':  # only the tail of the closing parameter tag may be here
+                raise ValueError(f"Parameter format error: expected '>\\n' but got '{content}'")
+
+        tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args)
+        tool_calls.append(tool_call)
+
+    return index, stop_token, tool_calls
+
+
+def parse_message_from_completion_text(text: str, thinking_mode: str) -> dict[str, Any]:
+    """Parse a model completion text into a structured assistant message.
+
+    Takes the raw text of a single assistant turn and extracts:
+    - reasoning_content (thinking block, only read when thinking_mode is "thinking")
+    - content (summary/response)
+    - tool_calls (if any)
+
+    NOTE: This function is designed to parse only correctly formatted strings.
+    Structural checks are ``assert`` statements, so malformed output raises
+    AssertionError (and goes unchecked under ``python -O``); parse_tool_calls raises ValueError.
+
+    Args:
+        text: The raw completion text (including EOS token).
+        thinking_mode: Either "chat" or "thinking".
+
+    Returns:
+        Dict with keys: "role", "content", "reasoning_content", "tool_calls".
+        tool_calls is the result of tool_calls_to_openai_format.
+    """
+    summary_content, reasoning_content, tool_calls = '', '', []
+    index, stop_token = 0, None
+    tool_calls_start_token = f'\n\n<{dsml_token}{tool_calls_block_name}'  # ">" is left for parse_tool_calls
+
+    is_thinking = thinking_mode == 'thinking'
+    is_tool_calling = False
+
+    if is_thinking:
+        index, content_delta, stop_token = _read_until_stop(index, text, [thinking_end_token, tool_calls_start_token])
+        reasoning_content = content_delta
+        assert stop_token == thinking_end_token, 'Invalid thinking format: missing </think>'
+
+    index, content_delta, stop_token = _read_until_stop(index, text, [eos_token, tool_calls_start_token])
+    summary_content = content_delta
+    if stop_token == tool_calls_start_token:
+        is_tool_calling = True
+    else:
+        assert stop_token == eos_token, 'Invalid format: missing EOS token'
+
+    if is_tool_calling:
+        index, stop_token, tool_calls = parse_tool_calls(index, text)
+
+        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
+        assert not tool_ends_text, 'Unexpected content after tool calls'
+
+    assert len(text) == index and stop_token in [eos_token, None], 'Unexpected content at end'
+
+    for sp_token in [bos_token, eos_token, thinking_start_token, thinking_end_token, dsml_token]:
+        assert sp_token not in summary_content and sp_token not in reasoning_content, \
+            f"Unexpected special token '{sp_token}' in content"
+
+    return {
+        'role': 'assistant',
+        'content': summary_content,
+        'reasoning_content': reasoning_content,
+        'tool_calls': tool_calls_to_openai_format(tool_calls)
+    }
diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index d2394fec4c..d5e59b3fc1 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import dataclasses
 import json
+import os
 import uuid
 from typing import Literal
 
@@ -642,6 +643,142 @@ def match(cls, model_path: str, **kwargs) -> str | None:
             return 'deepseek-vl2'
 
 
+@MODELS.register_module(name=['deepseek-v4'])
+class DeepseekV4ChatTemplate(BaseChatTemplate):
+    """Chat template of DeepSeek-V4 models; prompts are built by deepseek_v4_encoding.encode_messages."""
+
+    def __init__(self, eoa='<｜end▁of▁sentence｜>', stop_words=['<｜end▁of▁sentence｜>'], **kwargs):
+        super().__init__(eoa=eoa, stop_words=stop_words, **kwargs)
+
+    def get_prompt(self, prompt, sequence_start=True, **kwargs):
+        messages = [{'role': 'user', 'content': prompt}]  # wrap the bare prompt as a single user turn
+        return self.messages2prompt(messages, sequence_start, **kwargs)
+
+    def messages2prompt(self, messages, sequence_start=True, **kwargs):
+        from lmdeploy.deepseek_v4_encoding import encode_messages
+
+        if isinstance(messages, str):
+            messages = [{'role': 'user', 'content': messages}]
+
+        tools = self._normalize_tools(kwargs.pop('tools', None))
+        messages = self._with_tools(messages, tools) if tools else list(messages)  # work on a copy of the list
+
+        reasoning_effort = kwargs.pop('reasoning_effort', None)
+        if reasoning_effort not in ('high', 'max'):  # any other value is not forwarded to the encoder
+            reasoning_effort = None
+
+        thinking = kwargs.pop('thinking', False)
+        enable_thinking = kwargs.pop('enable_thinking', False)
+        thinking = thinking or enable_thinking  # either flag selects the "thinking" mode
+
+        drop_thinking = kwargs.pop('drop_thinking', True)
+        return encode_messages(messages,
+                               thinking_mode='thinking' if thinking else 'chat',
+                               drop_thinking=drop_thinking,
+                               add_default_bos_token=sequence_start,
+                               reasoning_effort=reasoning_effort)
+
+    @staticmethod
+    def _normalize_tools(tools):
+        if not tools:
+            return None
+
+        normalized = []
+        for tool in tools:
+            if hasattr(tool, 'model_dump'):  # objects exposing model_dump() are converted first
+                tool = tool.model_dump()
+            if not isinstance(tool, dict):
+                continue  # entries that are still not dicts are silently skipped
+            if 'function' in tool:
+                normalized.append(tool)
+            else:
+                normalized.append({'type': 'function', 'function': tool})  # bare function spec: wrap it
+        return normalized or None
+
+    @staticmethod
+    def _with_tools(messages, tools):
+        messages = [dict(message) for message in messages]  # shallow copies; the input dicts stay untouched
+        for message in messages:
+            if message.get('role') in ('system', 'developer'):
+                message['tools'] = tools  # attach to the first system/developer message only
+                return messages
+        return [{'role': 'system', 'content': '', 'tools': tools}] + messages  # none found: prepend one
+
+    @classmethod
+    def match(cls, model_path: str, trust_remote_code: bool = False, **kwargs) -> str | None:
+        try:
+            arch, cfg = get_model_arch(model_path, trust_remote_code=trust_remote_code)
+            cfg_dict = cfg.to_dict()
+        except Exception:  # best effort: fall back to reading config.json directly
+            cfg_dict = {}
+            config_path = os.path.join(model_path, 'config.json')
+            if os.path.exists(config_path):
+                try:
+                    with open(config_path, encoding='utf-8') as f:
+                        cfg_dict = json.load(f)
+                except Exception:  # unreadable or invalid JSON: treat as "no config"
+                    cfg_dict = {}
+            arch = (cfg_dict.get('architectures') or [None])[0]  # first listed architecture, if any
+
+        if arch == 'DeepseekV4ForCausalLM' or cfg_dict.get('model_type') == 'deepseek_v4':
+            return 'deepseek-v4'
+        return None
+
+
+@MODELS.register_module(name=['deepseek-v32', 'deepseek-v3.2'])
+class DeepseekV32ChatTemplate(BaseChatTemplate):
+    """Chat template of DeepSeek-V3.2 models; prompts are built by deepseek_v32_encoding.encode_messages."""
+
+    def __init__(self, eoa='<｜end▁of▁sentence｜>', stop_words=['<｜end▁of▁sentence｜>'], **kwargs):
+        super().__init__(eoa=eoa, stop_words=stop_words, **kwargs)
+
+    def get_prompt(self, prompt, sequence_start=True, **kwargs):
+        messages = [{'role': 'user', 'content': prompt}]  # wrap the bare prompt as a single user turn
+        return self.messages2prompt(messages, sequence_start, **kwargs)
+
+    def messages2prompt(self, messages, sequence_start=True, **kwargs):
+        from lmdeploy.deepseek_v32_encoding import encode_messages
+
+        if isinstance(messages, str):
+            messages = [{'role': 'user', 'content': messages}]
+
+        # Tool handling is shared with the V4 template (static helpers).
+        tools = DeepseekV4ChatTemplate._normalize_tools(kwargs.pop('tools', None))
+        messages = DeepseekV4ChatTemplate._with_tools(messages, tools) if tools else list(messages)
+
+        thinking = kwargs.pop('thinking', False)
+        enable_thinking = kwargs.pop('enable_thinking', False)
+        thinking = thinking or enable_thinking  # either flag selects the "thinking" mode
+
+        drop_thinking = kwargs.pop('drop_thinking', None)
+        if drop_thinking is None:  # not given: drop only when the conversation ends with a user turn
+            drop_thinking = bool(messages and messages[-1].get('role') == 'user')
+
+        return encode_messages(messages,
+                               thinking_mode='thinking' if thinking else 'chat',
+                               drop_thinking=drop_thinking,
+                               add_default_bos_token=sequence_start)
+
+    @classmethod
+    def match(cls, model_path: str, trust_remote_code: bool = False, **kwargs) -> str | None:
+        try:
+            arch, cfg = get_model_arch(model_path, trust_remote_code=trust_remote_code)
+            cfg_dict = cfg.to_dict()
+        except Exception:  # best effort: fall back to reading config.json directly
+            cfg_dict = {}
+            config_path = os.path.join(model_path, 'config.json')
+            if os.path.exists(config_path):
+                try:
+                    with open(config_path, encoding='utf-8') as f:
+                        cfg_dict = json.load(f)
+                except Exception:  # unreadable or invalid JSON: treat as "no config"
+                    cfg_dict = {}
+            arch = (cfg_dict.get('architectures') or [None])[0]  # first listed architecture, if any
+
+        if arch == 'DeepseekV32ForCausalLM' or cfg_dict.get('model_type') == 'deepseek_v32':
+            return 'deepseek-v32'
+        return None
+
+
 @MODELS.register_module(name=['llava-chatml'])
 class ChatmlDirect(BaseChatTemplate):
 
diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
index 68d4c51f85..44bfeacc45 100644
--- a/lmdeploy/serve/core/async_engine.py
+++ b/lmdeploy/serve/core/async_engine.py
@@ -481,7 +481,7 @@ async def generate(
             session_id: int | Session,
             gen_config: GenerationConfig | None = None,
             tools: list[object] | None = None,
-            reasoning_effort: Literal['low', 'medium', 'high'] | None = None,
+            reasoning_effort: Literal['low', 'medium', 'high', 'max'] | None = None,
             stream_response: bool = True,
             sequence_start: bool = True,
             sequence_end: bool = True,  # no interactive mode by default
diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py
index 675cbf5103..72eb841cf5 100644
--- a/lmdeploy/serve/openai/protocol.py
+++ b/lmdeploy/serve/openai/protocol.py
@@ -172,7 +172,7 @@ class ChatCompletionRequest(BaseModel):
     presence_penalty: float | None = 0.0
     frequency_penalty: float | None = 0.0
     user: str | None = None
-    reasoning_effort: Literal['low', 'medium', 'high'] | None = None
+    reasoning_effort: Literal['low', 'medium', 'high', 'max'] | None = None
     response_format: ResponseFormat | None = Field(default=None, examples=[None])
     # additional argument of lmdeploy
     do_preprocess: bool | None = True
diff --git a/lmdeploy/serve/parsers/reasoning_parser/__init__.py b/lmdeploy/serve/parsers/reasoning_parser/__init__.py
index 1f29020d7f..7b08f76525 100644
--- a/lmdeploy/serve/parsers/reasoning_parser/__init__.py
+++ b/lmdeploy/serve/parsers/reasoning_parser/__init__.py
@@ -1,5 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser
+from .deepseek_v4_reasoning_parser import DeepSeekV4ReasoningParser
+from .deepseek_v32_reasoning_parser import DeepSeekV32ReasoningParser
 from .reasoning_parser import LEGACY_REASONING_PARSER_NAMES, ReasoningParser, ReasoningParserManager
 
 __all__ = [
@@ -7,4 +9,6 @@
     'ReasoningParser',
     'ReasoningParserManager',
     'DeepSeekV3ReasoningParser',
+    'DeepSeekV32ReasoningParser',
+    'DeepSeekV4ReasoningParser',
 ]
diff --git a/lmdeploy/serve/parsers/reasoning_parser/deepseek_v32_reasoning_parser.py b/lmdeploy/serve/parsers/reasoning_parser/deepseek_v32_reasoning_parser.py
new file mode 100644
index 0000000000..b00f74ed34
--- /dev/null
+++ b/lmdeploy/serve/parsers/reasoning_parser/deepseek_v32_reasoning_parser.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .reasoning_parser import ReasoningParser, ReasoningParserManager
+
+
+@ReasoningParserManager.register_module(['deepseek-v32', 'deepseek-v3.2'])
+class DeepSeekV32ReasoningParser(ReasoningParser):
+    """DeepSeek-V3.2 reasoning parser; output starts as reasoning only if thinking was requested."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Either switch may be supplied; a missing one is stored as None.
+        self.thinking, self.enable_thinking = kwargs.get('thinking'), kwargs.get('enable_thinking')
+
+    def starts_in_reasoning_mode(self) -> bool:
+        return any(flag is True for flag in (self.thinking, self.enable_thinking))
diff --git a/lmdeploy/serve/parsers/reasoning_parser/deepseek_v4_reasoning_parser.py b/lmdeploy/serve/parsers/reasoning_parser/deepseek_v4_reasoning_parser.py
new file mode 100644
index 0000000000..a6ed06c1f8
--- /dev/null
+++ b/lmdeploy/serve/parsers/reasoning_parser/deepseek_v4_reasoning_parser.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .reasoning_parser import ReasoningParser, ReasoningParserManager
+
+
+@ReasoningParserManager.register_module('deepseek-v4')
+class DeepSeekV4ReasoningParser(ReasoningParser):
+    """DeepSeek-V4 reasoning parser; output starts as reasoning only if thinking was requested."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Either switch may be supplied; a missing one is stored as None.
+        self.thinking, self.enable_thinking = kwargs.get('thinking'), kwargs.get('enable_thinking')
+
+    def starts_in_reasoning_mode(self) -> bool:
+        return any(flag is True for flag in (self.thinking, self.enable_thinking))
diff --git a/lmdeploy/serve/parsers/response_parser.py b/lmdeploy/serve/parsers/response_parser.py
index dff1bc4b91..f7bce1aba7 100644
--- a/lmdeploy/serve/parsers/response_parser.py
+++ b/lmdeploy/serve/parsers/response_parser.py
@@ -274,6 +274,8 @@ def chat_template_kwargs_from_request(cls, request: ChatCompletionRequest) -> di
             else:
                 logger.warning(
                     '`enable_thinking` in `chat_template_kwargs` will override the value in request.')
+        if request.reasoning_effort in ('high', 'max'):
+            chat_template_kwargs.setdefault('reasoning_effort', request.reasoning_effort)
         return chat_template_kwargs
 
     def __init__(self, request: ChatCompletionRequest):
@@ -281,6 +283,8 @@ def __init__(self, request: ChatCompletionRequest):
         tcls = type(self).tool_parser_cls
         self._kwargs = type(self).chat_template_kwargs_from_request(request)
         self.enable_thinking: bool | None = self._kwargs.get('enable_thinking', None)
+        if self._kwargs.get('thinking') is True:
+            self.enable_thinking = True
         self.reasoning_parser: ReasoningParser | None = rcls(**self._kwargs) if rcls else None
         self.tool_parser: ToolParser | None = tcls() if tcls else None
         if self.tool_parser is not None:
@@ -672,8 +676,11 @@ def parse_complete(
                 close_idx = n
                 tool_payload = text[open_idx + len(open_tag):].strip()
             parsed_call = self.tool_parser.parse_tool_call_complete(tool_payload) if self.tool_parser else None
-            if parsed_call is not None:
-                tool_calls.append(parsed_call)
+            if parsed_call:
+                if isinstance(parsed_call, list):
+                    tool_calls.extend(parsed_call)
+                else:
+                    tool_calls.append(parsed_call)
                 pos = close_idx + len(close_tag) if close_tag else n
             else:
                 # Tool call parsing failed — fall back to plain text.
diff --git a/lmdeploy/serve/parsers/tool_parser/__init__.py b/lmdeploy/serve/parsers/tool_parser/__init__.py
index f5c547ac1e..f9c5ceaba7 100644
--- a/lmdeploy/serve/parsers/tool_parser/__init__.py
+++ b/lmdeploy/serve/parsers/tool_parser/__init__.py
@@ -1,4 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from .deepseek_v4_tool_parser import DeepSeekV4ToolParser
+from .deepseek_v32_tool_parser import DeepSeekV32ToolParser
 from .glm47_tool_parser import Glm47ToolParser
 from .internlm2_tool_parser import Internlm2ToolParser
 from .interns2preview_tool_parser import InternS2PreviewToolParser
@@ -13,6 +15,8 @@
     'ToolParser',
     'ToolParserManager',
     'XmlToolParser',
+    'DeepSeekV32ToolParser',
+    'DeepSeekV4ToolParser',
     'Glm47ToolParser',
     'Internlm2ToolParser',
     'Llama3JsonToolParser',
diff --git a/lmdeploy/serve/parsers/tool_parser/deepseek_v32_tool_parser.py b/lmdeploy/serve/parsers/tool_parser/deepseek_v32_tool_parser.py
new file mode 100644
index 0000000000..62f7f03a1d
--- /dev/null
+++ b/lmdeploy/serve/parsers/tool_parser/deepseek_v32_tool_parser.py
@@ -0,0 +1,107 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from __future__ import annotations
+
+import shortuuid
+
+from lmdeploy.deepseek_v32_encoding import dsml_token, parse_tool_calls
+from lmdeploy.serve.openai.protocol import (
+    DeltaFunctionCall,
+    DeltaToolCall,
+    FunctionCall,
+    ToolCall,
+)
+
+from .tool_parser import ToolParser, ToolParserManager
+
+TOOL_CALLS_BLOCK_NAME = 'function_calls'
+
+
+@ToolParserManager.register_module(['deepseek-v32', 'deepseek-v3.2'])
+class DeepSeekV32ToolParser(ToolParser):
+    """Tool parser for DeepSeek-V3.2 DSML function-call blocks."""
+
+    dsml_token = dsml_token  # class-level hooks so a subclass can swap token, block name and parser
+    tool_calls_block_name = TOOL_CALLS_BLOCK_NAME
+    parse_tool_calls_func = staticmethod(parse_tool_calls)
+
+    @classmethod
+    def get_tool_open_tag(cls) -> str | None:
+        return f'\n\n<{cls.dsml_token}{cls.tool_calls_block_name}>'  # includes the two leading newlines
+
+    @classmethod
+    def get_tool_close_tag(cls) -> str | None:
+        return f'</{cls.dsml_token}{cls.tool_calls_block_name}>'
+
+    @classmethod
+    def get_tool_payload_format(cls) -> str:
+        return 'dsml'
+
+    def decode_tool_incremental(self, added_text: str, *, final: bool) -> list[DeltaToolCall]:
+        self._tool_payload += added_text
+        if not final:  # keep buffering: deltas are only produced once the payload is final
+            return []
+
+        tool_calls = self.parse_tool_call_complete(self._tool_payload)
+        if not tool_calls:
+            return []
+
+        out: list[DeltaToolCall] = []
+        for offset, tool_call in enumerate(tool_calls):
+            index = self._active_tool_index + offset
+            out.append(  # first delta of a call: id, type and function name
+                DeltaToolCall(
+                    id=f'chatcmpl-tool-{shortuuid.random()}',
+                    index=index,
+                    type='function',
+                    function=DeltaFunctionCall(name=tool_call.function.name),
+                ))
+            out.append(  # second delta of the same call: the complete arguments
+                DeltaToolCall(
+                    id=None,
+                    index=index,
+                    type=None,
+                    function=DeltaFunctionCall(arguments=tool_call.function.arguments),
+                ))
+
+        self._active_tool_index += len(tool_calls) - 1  # NOTE(review): presumably the caller adds the last +1
+        return out
+
+    def parse_tool_call_complete(self, payload: str) -> list[ToolCall] | None:
+        payload = payload.strip()
+        if not payload:
+            return None
+
+        wrapped = f'{self.get_tool_open_tag()}\n{payload}\n{self.get_tool_close_tag()}'  # rebuild a full block
+        start = len(self.get_tool_open_tag()) - 1  # index of the ">" that ends the open tag
+        try:
+            _, stop_token, raw_tool_calls = self.parse_tool_calls_func(start, wrapped)
+        except Exception:  # any parse failure is reported as "no tool call"
+            return None
+        if stop_token != self.get_tool_close_tag() or not raw_tool_calls:
+            return None
+
+        return [
+            ToolCall(function=FunctionCall(name=tool_call['name'], arguments=tool_call['arguments']))
+            for tool_call in raw_tool_calls
+        ]
+
+    def validate_complete(self, text: str) -> bool:
+        open_tag = self.get_tool_open_tag()
+        close_tag = self.get_tool_close_tag()
+
+        pos = 0
+        while True:
+            open_idx = text.find(open_tag, pos)
+            close_idx = text.find(close_tag, pos)
+            if open_idx < 0:
+                return close_idx < 0  # no more blocks: valid unless a stray close tag remains
+
+            payload_start = open_idx + len(open_tag)
+            if close_idx < payload_start:  # close tag missing (-1) or located before the payload
+                return False
+            if self.parse_tool_call_complete(text[payload_start:close_idx]) is None:
+                return False
+
+            pos = close_idx + len(close_tag)
+            if pos >= len(text):
+                return True
diff --git a/lmdeploy/serve/parsers/tool_parser/deepseek_v4_tool_parser.py b/lmdeploy/serve/parsers/tool_parser/deepseek_v4_tool_parser.py
new file mode 100644
index 0000000000..9ee633b741
--- /dev/null
+++ b/lmdeploy/serve/parsers/tool_parser/deepseek_v4_tool_parser.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from __future__ import annotations
+
+from lmdeploy.deepseek_v4_encoding import dsml_token, parse_tool_calls, tool_calls_block_name
+
+from .deepseek_v32_tool_parser import DeepSeekV32ToolParser
+from .tool_parser import ToolParserManager
+
+
+@ToolParserManager.register_module(['deepseek-v4'])
+class DeepSeekV4ToolParser(DeepSeekV32ToolParser):
+    """Tool parser for DeepSeek-V4 DSML tool-call blocks."""
+
+    dsml_token = dsml_token  # values imported from deepseek_v4_encoding replace the V3.2 ones
+    tool_calls_block_name = tool_calls_block_name
+    parse_tool_calls_func = staticmethod(parse_tool_calls)  # staticmethod: called via self without binding
diff --git a/lmdeploy/serve/parsers/tool_parser/tool_parser.py b/lmdeploy/serve/parsers/tool_parser/tool_parser.py
index f95b662a12..33b19cbd38 100644
--- a/lmdeploy/serve/parsers/tool_parser/tool_parser.py
+++ b/lmdeploy/serve/parsers/tool_parser/tool_parser.py
@@ -73,7 +73,7 @@ def decode_tool_incremental(self, added_text: str, *, final: bool) -> list[Delta
         """Decode incremental tool payload emitted between tool tags."""
         raise NotImplementedError('ToolParser.decode_tool_incremental has not been implemented!')
 
-    def parse_tool_call_complete(self, payload: str) -> ToolCall | None:
+    def parse_tool_call_complete(self, payload: str) -> ToolCall | list[ToolCall] | None:
         """Parse one complete tool payload into OpenAI tool call object."""
         raise NotImplementedError('ToolParser.parse_tool_call_complete has not been implemented!')
 
diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index cf2452935e..4917031787 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -186,7 +186,7 @@ async def get_prompt_input(self,
                                sequence_start: bool,
                                adapter_name: str,
                                tools: list[object] | None = None,
-                               reasoning_effort: Literal['low', 'medium', 'high'] | None = None,
+                               reasoning_effort: Literal['low', 'medium', 'high', 'max'] | None = None,
                                chat_template_kwargs: dict | None = None,
                                media_io_kwargs: dict[str, Any] | None = None,
                                mm_processor_kwargs: dict[str, Any] | None = None,
@@ -346,7 +346,7 @@ async def _get_text_prompt_input(self,
                                      sequence_start: bool,
                                      adapter_name: str,
                                      tools: list[object] | None = None,
-                                     reasoning_effort: Literal['low', 'medium', 'high'] | None = None,
+                                     reasoning_effort: Literal['low', 'medium', 'high', 'max'] | None = None,
                                      chat_template_kwargs: dict | None = None,
                                      **kwargs):
         """Process text-only prompt and return prompt string and input_ids."""
diff --git a/tests/test_lmdeploy/test_deepseek_v32_encoding.py b/tests/test_lmdeploy/test_deepseek_v32_encoding.py
new file mode 100644
index 0000000000..64e6723ea7
--- /dev/null
+++ b/tests/test_lmdeploy/test_deepseek_v32_encoding.py
@@ -0,0 +1,254 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+
+from lmdeploy.deepseek_v32_encoding import (
+    bos_token,
+    encode_messages,
+    eos_token,
+    parse_message_from_completion_text,
+)
+from lmdeploy.model import MODELS, DeepseekV32ChatTemplate, get_chat_template
+from lmdeploy.serve.openai.protocol import ChatCompletionRequest
+from lmdeploy.serve.parsers import ResponseParserManager
+from lmdeploy.serve.parsers.reasoning_parser import ReasoningParserManager
+from lmdeploy.serve.parsers.tool_parser import ToolParserManager
+
+WEATHER_TOOL = {  # function-tool schema with one required string parameter; used by the tool-prompt test
+    'type': 'function',
+    'function': {
+        'name': 'get_weather',
+        'description': 'Get weather for a city.',
+        'parameters': {
+            'type': 'object',
+            'properties': {
+                'city': {
+                    'type': 'string'
+                }
+            },
+            'required': ['city'],
+        },
+    },
+}
+
+
+def test_deepseek_v32_minimal_chat_and_thinking_modes():
+    """A lone user turn ends with `</think>` in chat mode and `<think>` in thinking mode."""
+    hello = [{'role': 'user', 'content': 'Hello'}]
+
+    # Chat mode is checked first, then thinking mode, each against the full expected prompt.
+    chat_prompt = encode_messages(hello, thinking_mode='chat')
+    assert chat_prompt == f'{bos_token}<｜User｜>Hello<｜Assistant｜></think>'
+    thinking_prompt = encode_messages(hello, thinking_mode='thinking')
+    assert thinking_prompt == f'{bos_token}<｜User｜>Hello<｜Assistant｜><think>'
+
+
+def test_deepseek_v32_uses_function_call_block():
+    """Tool schemas and a prior tool call are rendered with the `function_calls` DSML block."""
+    system_turn = {
+        'role': 'system',
+        'content': 'You may call tools.',
+        'tools': [WEATHER_TOOL],
+    }
+    user_turn = {'role': 'user', 'content': 'Weather in Paris?'}
+    # The call arguments are supplied as a JSON-encoded string.
+    weather_call = {
+        'type': 'function',
+        'function': {
+            'name': 'get_weather',
+            'arguments': '{"city": "Paris"}',
+        },
+    }
+    assistant_turn = {
+        'role': 'assistant',
+        'reasoning_content': 'I should call the weather tool.',
+        'tool_calls': [weather_call],
+    }
+
+    turns = [system_turn, user_turn, assistant_turn]
+    prompt = encode_messages(turns, thinking_mode='thinking', drop_thinking=False)
+
+    # The V3.2 block is named `function_calls`; the `tool_calls` name must not appear.
+    assert '## Tools' in prompt
+    assert '"name": "get_weather"' in prompt
+    assert '<｜DSML｜function_calls>' in prompt
+    assert '</｜DSML｜function_calls>' in prompt
+    assert '<｜DSML｜tool_calls>' not in prompt
+    assert '<｜DSML｜parameter name="city" string="true">Paris' in prompt
+
+
+def test_deepseek_v32_tool_results_reopen_thinking():
+    """A tool result is rendered as a `<function_results>` block followed by `<think>`."""
+    weather_call = {
+        'id': 'call_1',
+        'type': 'function',
+        'function': {
+            'name': 'get_weather',
+            'arguments': '{"city": "Paris"}',
+        },
+    }
+    conversation = [
+        {'role': 'user', 'content': 'Weather in Paris?'},
+        {'role': 'assistant', 'tool_calls': [weather_call]},
+        {'role': 'tool', 'tool_call_id': 'call_1', 'content': 'Sunny'},
+    ]
+
+    prompt = encode_messages(
+        conversation,
+        thinking_mode='thinking',
+        drop_thinking=False,
+    )
+
+    # The result block must be directly followed by the thinking start token.
+    expected_fragment = (
+        '<function_results>\n<result>Sunny</result>\n</function_results>\n\n<think>'
+    )
+    assert expected_fragment in prompt
+
+
+def test_deepseek_v32_parse_completion_text():
+    """Completion text is split into reasoning, empty content and one decoded tool call."""
+    dsml_block = (
+        '<｜DSML｜function_calls>\n'
+        '<｜DSML｜invoke name="get_weather">\n'
+        '<｜DSML｜parameter name="city" string="true">Paris</｜DSML｜parameter>\n'
+        '</｜DSML｜invoke>\n'
+        '</｜DSML｜function_calls>'
+    )
+    completion = 'I should call a tool.</think>\n\n' + dsml_block + eos_token
+
+    parsed = parse_message_from_completion_text(completion, thinking_mode='thinking')
+    assert parsed['reasoning_content'] == 'I should call a tool.'
+    assert parsed['content'] == ''
+    first_call = parsed['tool_calls'][0]['function']
+    assert first_call['name'] == 'get_weather'
+    assert json.loads(first_call['arguments']) == {'city': 'Paris'}
+
+
+def test_deepseek_v32_chat_template_uses_vllm_thinking_switches():
+    """`thinking=True` and `enable_thinking=True` both end in `<think>`; the default ends in `</think>`."""
+    template = MODELS.get('deepseek-v32')()
+    closed = f'{bos_token}<｜User｜>Hello<｜Assistant｜></think>'
+    opened = f'{bos_token}<｜User｜>Hello<｜Assistant｜><think>'
+
+    # A fresh message list is passed on every call, as in a real request.
+    default_prompt = template.messages2prompt([{'role': 'user', 'content': 'Hello'}])
+    assert default_prompt == closed
+    assert template.messages2prompt([{'role': 'user', 'content': 'Hello'}], thinking=True) == opened
+    assert template.messages2prompt([{'role': 'user', 'content': 'Hello'}], enable_thinking=True) == opened
+
+
+def test_deepseek_v32_chat_template_normalizes_lmdeploy_tools_and_dict_arguments():
+    """Flat tool specs and dict-valued call arguments are rendered as regular DSML parameters."""
+    editor_tool = {
+        'name': 'str_replace_editor',
+        'description': 'Edit files',
+        'parameters': {
+            'type': 'object',
+            'properties': {
+                'command': {
+                    'type': 'string'
+                },
+                'path': {
+                    'type': 'string'
+                },
+            },
+            'required': ['command', 'path'],
+        },
+    }
+    # `arguments` is a dict here rather than a JSON-encoded string.
+    editor_call = {
+        'type': 'function',
+        'function': {
+            'name': 'str_replace_editor',
+            'arguments': {
+                'command': 'view',
+                'path': '/testbed',
+            },
+        },
+    }
+    history = [
+        {'role': 'user', 'content': 'List files'},
+        {'role': 'assistant', 'tool_calls': [editor_call]},
+    ]
+
+    template = MODELS.get('deepseek-v32')()
+    prompt = template.messages2prompt(
+        history,
+        tools=[editor_tool],
+        enable_thinking=True,
+        drop_thinking=False,
+    )
+
+    assert '## Tools' in prompt
+    assert '<｜DSML｜function_calls>' in prompt
+    assert '<｜DSML｜tool_calls>' not in prompt
+    assert '"name": "str_replace_editor"' in prompt
+    assert '<｜DSML｜parameter name="command" string="true">view' in prompt
+    assert '<｜DSML｜parameter name="path" string="true">/testbed' in prompt
+    assert 'parameter name="arguments"' not in prompt
+
+
+def test_deepseek_v32_chat_template_match_minimal_config(tmp_path):
+    """A config.json holding only model_type and architectures selects the V3.2 template."""
+    config = {
+        'model_type': 'deepseek_v32',
+        'architectures': ['DeepseekV32ForCausalLM'],
+    }
+    (tmp_path / 'config.json').write_text(json.dumps(config), encoding='utf-8')
+    model_dir = str(tmp_path)
+    assert DeepseekV32ChatTemplate.match(model_dir) == 'deepseek-v32'
+    assert isinstance(get_chat_template(model_dir), DeepseekV32ChatTemplate)
+
+
+def _make_response_parser(thinking=True):
+    """Return a 'default' response parser that uses the deepseek-v32 reasoning and tool parsers."""
+    parser_cls = ResponseParserManager.get('default')  # NOTE(review): attributes are set on this class object
+    parser_cls.reasoning_parser_cls = ReasoningParserManager.get('deepseek-v32')
+    parser_cls.tool_parser_cls = ToolParserManager.get('deepseek-v32')
+    template_kwargs = {'thinking': thinking}
+    request = ChatCompletionRequest(
+        model='deepseek-ai/DeepSeek-V3.2', messages=[], stream=True,
+        chat_template_kwargs=template_kwargs,
+    )
+    return parser_cls(request=request)
+
+
+def test_deepseek_v32_response_parser_complete_dsml_function_calls():
+    """`parse_complete` returns the reasoning, one tool call and no content for a DSML block."""
+    dsml_block = (
+        '<｜DSML｜function_calls>\n'
+        '<｜DSML｜invoke name="get_weather">\n'
+        '<｜DSML｜parameter name="city" string="true">Paris</｜DSML｜parameter>\n'
+        '</｜DSML｜invoke>\n'
+        '</｜DSML｜function_calls>'
+    )
+    completion = 'I should call a tool.</think>\n\n' + dsml_block
+    parser = _make_response_parser(thinking=True)
+    content, calls, reasoning = parser.parse_complete(completion)
+    assert content is None
+    assert reasoning == 'I should call a tool.'
+    assert calls is not None and len(calls) == 1
+    only_call = calls[0].function
+    assert only_call.name == 'get_weather'
+    assert json.loads(only_call.arguments) == {'city': 'Paris'}
+    assert parser.validate_complete(completion)
+
+
+def test_deepseek_v32_response_parser_streaming_dsml_function_calls():
+    """One-chunk streaming yields the reasoning text, then a name delta and an arguments delta."""
+    dsml_block = (
+        '<｜DSML｜function_calls>\n'
+        '<｜DSML｜invoke name="search">\n'
+        '<｜DSML｜parameter name="query" string="true">DeepSeek V3.2</｜DSML｜parameter>\n'
+        '</｜DSML｜invoke>\n'
+        '</｜DSML｜function_calls>'
+    )
+    text = 'need data</think>\n\n' + dsml_block
+    parser = _make_response_parser(thinking=True)
+    deltas = parser.stream_chunk(delta_text=text, delta_token_ids=[])
+    messages = [delta for delta, _ in deltas]
+    reasoning = ''.join(message.reasoning_content or '' for message in messages)
+    tool_deltas = [call for message in messages for call in (message.tool_calls or [])]
+    assert reasoning == 'need data'
+    assert tool_deltas[0].function.name == 'search'
+    assert json.loads(tool_deltas[1].function.arguments) == {'query': 'DeepSeek V3.2'}
diff --git a/tests/test_lmdeploy/test_deepseek_v4_encoding.py b/tests/test_lmdeploy/test_deepseek_v4_encoding.py
new file mode 100644
index 0000000000..5acdad967f
--- /dev/null
+++ b/tests/test_lmdeploy/test_deepseek_v4_encoding.py
@@ -0,0 +1,274 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+
+from lmdeploy.deepseek_v4_encoding import (
+    REASONING_EFFORT_MAX,
+    bos_token,
+    encode_messages,
+    eos_token,
+    parse_message_from_completion_text,
+)
+from lmdeploy.model import MODELS, DeepseekV4ChatTemplate, get_chat_template
+from lmdeploy.serve.openai.protocol import ChatCompletionRequest
+from lmdeploy.serve.parsers import ResponseParserManager
+from lmdeploy.serve.parsers.reasoning_parser import ReasoningParserManager
+from lmdeploy.serve.parsers.tool_parser import ToolParserManager
+
+WEATHER_TOOL = {  # function-tool schema with one required string parameter; used by the tool-prompt test
+    'type': 'function',
+    'function': {
+        'name': 'get_weather',
+        'description': 'Get weather for a city.',
+        'parameters': {
+            'type': 'object',
+            'properties': {
+                'city': {
+                    'type': 'string'
+                }
+            },
+            'required': ['city'],
+        },
+    },
+}
+
+
+def test_deepseek_v4_minimal_chat_and_thinking_modes():
+    """A lone user turn ends with `</think>` in chat mode and `<think>` in thinking mode."""
+    hello = [{'role': 'user', 'content': 'Hello'}]
+
+    # Chat mode is checked first, then thinking mode, each against the full expected prompt.
+    chat_prompt = encode_messages(hello, thinking_mode='chat')
+    assert chat_prompt == f'{bos_token}<｜User｜>Hello<｜Assistant｜></think>'
+    thinking_prompt = encode_messages(hello, thinking_mode='thinking')
+    assert thinking_prompt == f'{bos_token}<｜User｜>Hello<｜Assistant｜><think>'
+
+
+def test_deepseek_v4_uses_v4_tool_call_block():
+    """Tool schemas and a prior tool call are rendered with the `tool_calls` DSML block."""
+    system_turn = {
+        'role': 'system',
+        'content': 'You may call tools.',
+        'tools': [WEATHER_TOOL],
+    }
+    user_turn = {'role': 'user', 'content': 'Weather in Paris?'}
+    # The call arguments are supplied as a JSON-encoded string.
+    weather_call = {
+        'type': 'function',
+        'function': {
+            'name': 'get_weather',
+            'arguments': '{"city": "Paris"}',
+        },
+    }
+    assistant_turn = {
+        'role': 'assistant',
+        'reasoning_content': 'I should call the weather tool.',
+        'tool_calls': [weather_call],
+    }
+
+    turns = [system_turn, user_turn, assistant_turn]
+    prompt = encode_messages(turns, thinking_mode='thinking', drop_thinking=False)
+
+    # The V4 block is named `tool_calls`; the `function_calls` name must not appear.
+    assert '## Tools' in prompt
+    assert '"name": "get_weather"' in prompt
+    assert '<｜DSML｜tool_calls>' in prompt
+    assert '</｜DSML｜tool_calls>' in prompt
+    assert '<｜DSML｜function_calls>' not in prompt
+    assert '<｜DSML｜parameter name="city" string="true">Paris' in prompt
+
+
+def test_deepseek_v4_merges_tool_results_into_user_blocks():
+    """The tool result is rendered as `<tool_result>` after the assistant's `tool_calls` block."""
+    weather_call = {
+        'id': 'call_1',
+        'type': 'function',
+        'function': {
+            'name': 'get_weather',
+            'arguments': '{"city": "Paris"}',
+        },
+    }
+    conversation = [
+        {'role': 'user', 'content': 'Weather in Paris?'},
+        {'role': 'assistant', 'tool_calls': [weather_call]},
+        {
+            'role': 'tool',
+            'tool_call_id': 'call_1',
+            'content': 'Sunny',
+        },
+    ]
+
+    prompt = encode_messages(conversation, thinking_mode='chat')
+
+    # Presence first, then relative order of the call block and the result.
+    result_tag = '<tool_result>Sunny</tool_result>'
+    assert result_tag in prompt
+    call_pos = prompt.index('<｜DSML｜tool_calls>')
+    result_pos = prompt.index(result_tag)
+    assert call_pos < result_pos
+
+
+def test_deepseek_v4_task_and_latest_reminder_rendering():
+    """A `latest_reminder` turn and a user turn with a `task` field get their own markers."""
+    reminder = {'role': 'latest_reminder', 'content': 'Be terse.'}
+    task_turn = {
+        'role': 'user',
+        'content': 'Classify this page.',
+        'task': 'domain',
+    }
+
+    prompt = encode_messages(
+        [reminder, task_turn],
+        thinking_mode='chat',
+    )
+
+    # The reminder content follows its role marker.
+    assert '<｜latest_reminder｜>Be terse.' in prompt
+    # The task name follows the user content as a marker, and no assistant marker is emitted.
+    assert '<｜User｜>Classify this page.<｜domain｜>' in prompt
+    assert '<｜Assistant｜>' not in prompt
+
+
+def test_deepseek_v4_parse_completion_text():
+    """Completion text is split into reasoning, empty content and one decoded tool call."""
+    dsml_block = (
+        '<｜DSML｜tool_calls>\n'
+        '<｜DSML｜invoke name="get_weather">\n'
+        '<｜DSML｜parameter name="city" string="true">Paris</｜DSML｜parameter>\n'
+        '</｜DSML｜invoke>\n'
+        '</｜DSML｜tool_calls>'
+    )
+    completion = 'I should call a tool.</think>\n\n' + dsml_block + eos_token
+
+    parsed = parse_message_from_completion_text(completion, thinking_mode='thinking')
+    assert parsed['reasoning_content'] == 'I should call a tool.'
+    assert parsed['content'] == ''
+    first_call = parsed['tool_calls'][0]['function']
+    assert first_call['name'] == 'get_weather'
+    assert json.loads(first_call['arguments']) == {'city': 'Paris'}
+
+
+def test_deepseek_v4_chat_template_normalizes_lmdeploy_tools():
+    """With thinking on and effort 'max', the prompt starts with BOS + REASONING_EFFORT_MAX and lists the tool."""
+    weather_tool = {
+        'name': 'get_weather',
+        'description': 'Get weather for a location.',
+        'parameters': {
+            'type': 'object',
+            'properties': {
+                'location': {'type': 'string'},
+            },
+            'required': ['location'],
+        },
+    }
+    history = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {'role': 'user', 'content': "What's the weather in Beijing?"},
+    ]
+    template = MODELS.get('deepseek-v4')()
+    prompt = template.messages2prompt(
+        history,
+        tools=[weather_tool],
+        enable_thinking=True,
+        reasoning_effort='max',
+    )
+
+    assert prompt.startswith(bos_token + REASONING_EFFORT_MAX)
+    assert '## Tools' in prompt
+    assert '"name": "get_weather"' in prompt
+    assert prompt.endswith('<｜Assistant｜><think>')
+
+
+def test_deepseek_v4_reasoning_effort_does_not_enable_thinking():
+    """`reasoning_effort='max'` alone leaves the prompt in non-thinking form."""
+    template = MODELS.get('deepseek-v4')()
+    hello = [{'role': 'user', 'content': 'Hello'}]
+    prompt = template.messages2prompt(hello, reasoning_effort='max')
+
+    assert REASONING_EFFORT_MAX not in prompt
+    assert prompt == f'{bos_token}<｜User｜>Hello<｜Assistant｜></think>'
+
+
+def test_deepseek_v4_chat_template_match_minimal_config(tmp_path):
+    """A config.json holding only model_type and architectures selects the V4 template."""
+    config = {
+        'model_type': 'deepseek_v4',
+        'architectures': ['DeepseekV4ForCausalLM'],
+    }
+    (tmp_path / 'config.json').write_text(json.dumps(config), encoding='utf-8')
+    model_dir = str(tmp_path)
+    assert DeepseekV4ChatTemplate.match(model_dir) == 'deepseek-v4'
+    assert isinstance(get_chat_template(model_dir), DeepseekV4ChatTemplate)
+
+
+def _make_response_parser(thinking=True):
+    """Return a 'default' response parser that uses the deepseek-v4 reasoning and tool parsers."""
+    parser_cls = ResponseParserManager.get('default')  # NOTE(review): attributes are set on this class object
+    parser_cls.reasoning_parser_cls = ReasoningParserManager.get('deepseek-v4')
+    parser_cls.tool_parser_cls = ToolParserManager.get('deepseek-v4')
+    template_kwargs = {'thinking': thinking}
+    request = ChatCompletionRequest(
+        model='deepseek-ai/DeepSeek-V4', messages=[], stream=True,
+        chat_template_kwargs=template_kwargs,
+    )
+    return parser_cls(request=request)
+
+
+def test_deepseek_v4_response_parser_complete_dsml_tool_call():
+    """`parse_complete` returns the reasoning, one tool call and no content for a DSML block."""
+    dsml_block = (
+        '<｜DSML｜tool_calls>\n'
+        '<｜DSML｜invoke name="get_weather">\n'
+        '<｜DSML｜parameter name="city" string="true">Paris</｜DSML｜parameter>\n'
+        '</｜DSML｜invoke>\n'
+        '</｜DSML｜tool_calls>'
+    )
+    completion = 'I should call a tool.</think>\n\n' + dsml_block
+    parser = _make_response_parser(thinking=True)
+    content, calls, reasoning = parser.parse_complete(completion)
+    assert content is None
+    assert reasoning == 'I should call a tool.'
+    assert calls is not None and len(calls) == 1
+    only_call = calls[0].function
+    assert only_call.name == 'get_weather'
+    assert json.loads(only_call.arguments) == {'city': 'Paris'}
+    assert parser.validate_complete(completion)
+
+
+def test_deepseek_v4_response_parser_streaming_dsml_tool_call():
+    """One-chunk streaming yields the reasoning text, then a name delta and an arguments delta."""
+    dsml_block = (
+        '<｜DSML｜tool_calls>\n'
+        '<｜DSML｜invoke name="search">\n'
+        '<｜DSML｜parameter name="query" string="true">DeepSeek V4</｜DSML｜parameter>\n'
+        '</｜DSML｜invoke>\n'
+        '</｜DSML｜tool_calls>'
+    )
+    text = 'need a tool</think>\n\n' + dsml_block
+    parser = _make_response_parser(thinking=True)
+    deltas = parser.stream_chunk(delta_text=text, delta_token_ids=[])
+    messages = [delta for delta, _ in deltas]
+    reasoning = ''.join(message.reasoning_content or '' for message in messages)
+    tool_deltas = [call for message in messages for call in (message.tool_calls or [])]
+    assert reasoning == 'need a tool'
+    assert tool_deltas[0].function.name == 'search'
+    assert json.loads(tool_deltas[1].function.arguments) == {'query': 'DeepSeek V4'}
+
+
+def test_deepseek_v4_response_parser_reasoning_effort_does_not_enable_thinking():
+    """With only `reasoning_effort='max'` set, streamed text is emitted as content, not reasoning."""
+    parser_cls = ResponseParserManager.get('default')
+    parser_cls.reasoning_parser_cls = ReasoningParserManager.get('deepseek-v4')
+    parser_cls.tool_parser_cls = None  # NOTE(review): set on the class object, not on one instance
+    request = ChatCompletionRequest(
+        model='deepseek-ai/DeepSeek-V4', messages=[], stream=True,
+        reasoning_effort='max',
+    )
+    parser = parser_cls(request=request)
+
+    deltas = parser.stream_chunk(delta_text='hello', delta_token_ids=[])
+    # Exactly one delta is expected: plain content, no tool emission, no reasoning.
+    assert len(deltas) == 1
+    delta, tool_emitted = deltas[0]
+    assert tool_emitted is False
+    assert delta.content == 'hello'
+    assert delta.reasoning_content is None