feat(gemma): add tool call parser

qinxuye · qinxuye · commit 6f3afc24fb47 · 2026-04-12T12:07:23.000+08:00
diff --git a/xinference/model/llm/mlx/core.py b/xinference/model/llm/mlx/core.py
@@ -53,6 +53,7 @@
 from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
+    GEMMA_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
     ChatModelMixin,
     generate_completion_chunk,
@@ -1186,6 +1187,7 @@ async def async_chat(
         if tools:
             if (
                 model_family in QWEN_TOOL_CALL_FAMILY
+                or model_family in GEMMA_TOOL_CALL_FAMILY
                 or model_family in DEEPSEEK_TOOL_CALL_FAMILY
             ):
                 full_context_kwargs["tools"] = tools
@@ -1547,7 +1549,10 @@ def chat(
             )
             chat_context_var.set(chat_template_kwargs)
             full_context_kwargs = chat_template_kwargs.copy()
-            if tools and model_family in QWEN_TOOL_CALL_FAMILY:
+            if tools and (
+                model_family in QWEN_TOOL_CALL_FAMILY
+                or model_family in GEMMA_TOOL_CALL_FAMILY
+            ):
                 full_context_kwargs["tools"] = tools
             chat_template = self.model_family.chat_template
             tokenizer = None
diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py
@@ -38,6 +38,7 @@
 from ..core import chat_context_var
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
+    GEMMA_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_SYMBOLS,
     ChatModelMixin,
@@ -730,6 +731,7 @@ async def async_chat(
         if tools:
             if (
                 model_family in QWEN_TOOL_CALL_FAMILY
+                or model_family in GEMMA_TOOL_CALL_FAMILY
                 or model_family in DEEPSEEK_TOOL_CALL_FAMILY
             ):
                 full_context_kwargs["tools"] = tools
diff --git a/xinference/model/llm/tool_parsers/__init__.py b/xinference/model/llm/tool_parsers/__init__.py
@@ -53,6 +53,7 @@ def decorator(cls: Type[Any]) -> Type[Any]:
     deepseek_r1_tool_parser,
     deepseek_v3_1_tool_parser,
     deepseek_v3_tool_parser,
+    gemma_tool_parser,
     glm4_tool_parser,
     llama3_tool_parser,
     minimax_tool_parser,
diff --git a/xinference/model/llm/tool_parsers/gemma_tool_parser.py b/xinference/model/llm/tool_parsers/gemma_tool_parser.py
@@ -0,0 +1,132 @@
+import json
+import logging
+import re
+from typing import Any, Dict, List, Optional, Tuple
+
+from . import register_tool_parser
+from .abstract_tool_parser import ToolParser
+
+logger = logging.getLogger(__name__)
+
+
+@register_tool_parser("gemma")
+class GemmaToolParser(ToolParser):
+    """
+    Tool parser for Gemma-4 style tool call blocks.
+
+    Gemma emits tool invocations using tokens like:
+        <|tool_call>call:get_weather{location:<|"|>Shanghai<|"|>}<tool_call|>
+    where strings are wrapped with <|"|> ... <|"|>.
+
+    Results are ``(content, function_name, arguments)`` tuples: plain text fills
+    only ``content``, a parsed call fills the other two, and a call that cannot
+    be parsed is handed back verbatim as ``content``.
+    """
+
+    # Token Gemma uses in place of a double quote around string values.
+    _STRING_DELIMITER = '<|"|>'
+    # An unquoted object key directly after "{" or ",".
+    _BARE_KEY_REGEX = re.compile(r"([{,])\s*([A-Za-z0-9_\-]+)\s*:")
+
+    def __init__(self):
+        self.tool_call_start_token = "<|tool_call>"
+        self.tool_call_end_token = "<tool_call|>"
+        self.tool_call_regex = re.compile(
+            r"(<\|tool_call\>.*?<tool_call\|>)", re.DOTALL
+        )
+        self.call_header_regex = re.compile(r"call\s*:\s*([^{\s]+)", re.IGNORECASE)
+
+    def _parse_arguments(self, arg_block: str) -> Dict[str, Any]:
+        """Turn an argument block such as ``{k:<|"|>v<|"|>}`` into a dict.
+
+        Raises ``ValueError`` (``json.JSONDecodeError``) if the block is invalid.
+        """
+        arg_block = arg_block.strip()
+        if not arg_block:
+            return {}
+        # Splitting on the delimiter puts string payloads at the odd indexes, so
+        # bare keys get quoted only outside of strings; a value like
+        # "Paris, France: center" is therefore not mangled by the key pattern.
+        pieces = arg_block.split(self._STRING_DELIMITER)
+        pieces[0::2] = [
+            self._BARE_KEY_REGEX.sub(r'\1"\2":', piece) for piece in pieces[0::2]
+        ]
+        try:
+            # Payloads that already are valid JSON string bodies keep their escapes.
+            return json.loads('"'.join(pieces))
+        except ValueError:
+            # Raw quotes, backslashes or newlines: escape the payloads literally.
+            pieces[1::2] = [json.dumps(piece) for piece in pieces[1::2]]
+            return json.loads("".join(pieces))
+
+    def _parse_tool_call_block(
+        self, block: str
+    ) -> Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]:
+        """Parse one ``<|tool_call>...<tool_call|>`` block into a result tuple."""
+        body = block.strip()
+        try:
+            if body.startswith(self.tool_call_start_token):
+                body = body[len(self.tool_call_start_token) :]
+            if body.endswith(self.tool_call_end_token):
+                body = body[: -len(self.tool_call_end_token)]
+            header = self.call_header_regex.search(body)
+            if header is None:
+                raise ValueError("Missing call header")
+            # Arguments run from the first "{" after the name to the last "}".
+            brace_start = body.find("{", header.end())
+            brace_end = body.rfind("}")
+            args: Dict[str, Any] = {}
+            if -1 < brace_start < brace_end:
+                args = self._parse_arguments(body[brace_start : brace_end + 1])
+            return (None, header.group(1).strip(), args)
+        except Exception as exc:
+            # Deliberately broad: a malformed call degrades to plain text.
+            logger.warning("Failed to parse Gemma tool call: %s, error: %s", block, exc)
+            return (block, None, None)
+
+    def extract_tool_calls(
+        self, model_output: str
+    ) -> List[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+        """Split a complete model output into ordered text and tool-call tuples."""
+        if self.tool_call_start_token not in model_output:
+            return [(model_output, None, None)]
+
+        results: List[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]] = (
+            []
+        )
+        cursor = 0
+        for match in self.tool_call_regex.finditer(model_output):
+            if match.start() > cursor:
+                results.append((model_output[cursor : match.start()], None, None))
+            results.append(self._parse_tool_call_block(match.group(0)))
+            cursor = match.end()
+        if cursor < len(model_output):
+            # Trailing text, including a call that was opened but never closed.
+            results.append((model_output[cursor:], None, None))
+        return results
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_texts: List[str],
+        current_text: str,
+        delta_text: str,
+    ) -> Optional[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+        """Parse streamed output; return None while a tool call is still open."""
+        start_token = self.tool_call_start_token
+        if start_token not in current_text:
+            return (delta_text, None, None)
+
+        prev_len = len(previous_texts[-1]) if previous_texts else 0
+        closed = list(self.tool_call_regex.finditer(current_text))
+        closed_end = closed[-1].end() if closed else 0
+        if closed_end > prev_len:
+            # This delta completed a call: emit it.
+            return self._parse_tool_call_block(closed[-1].group(0))
+        open_at = current_text.find(start_token, closed_end)
+        if open_at == -1:
+            # Every call is closed and already reported: the delta is plain text.
+            return (delta_text, None, None)
+        if open_at > prev_len:
+            # Text that precedes a call opened within this delta.
+            return (current_text[prev_len:open_at], None, None)
+        return None  # the delta belongs to an unfinished call; do not leak it
diff --git a/xinference/model/llm/tool_parsers/tests/test_gemma_tool_parser.py b/xinference/model/llm/tool_parsers/tests/test_gemma_tool_parser.py
@@ -0,0 +1,48 @@
+import pytest
+
+from ..gemma_tool_parser import GemmaToolParser
+
+
+@pytest.fixture
+def parser():
+    return GemmaToolParser()  # default fixture scope: a new parser for every test
+
+
+def test_extract_tool_calls(parser):
+    """A lone tool call block yields exactly one parsed call."""
+    arguments = '{location:<|"|>上海<|"|>,unit:<|"|>celsius<|"|>}'
+    raw = "<|tool_call>call:get_weather" + arguments + "<tool_call|>"
+    expected_call = (None, "get_weather", {"location": "上海", "unit": "celsius"})
+
+    parsed = parser.extract_tool_calls(raw)
+    assert parsed == [expected_call]
+
+
+def test_extract_tool_calls_with_surrounding_text(parser):
+    """Text around a tool call is returned as separate content entries."""
+    leading, trailing = "Thought...\n", "\nThanks"
+    call = "<|tool_call>call:get_weather" '{location:<|"|>上海<|"|>}' "<tool_call|>"
+
+    parsed = parser.extract_tool_calls(leading + call + trailing)
+
+    expected = [
+        (leading, None, None),
+        (None, "get_weather", {"location": "上海"}),
+        (trailing, None, None),
+    ]
+    assert parsed == expected
+
+
+def test_extract_tool_calls_streaming(parser):
+    """A delta that carries a whole tool call block yields the parsed call."""
+    call = "<|tool_call>call:get_weather" '{location:<|"|>上海<|"|>}' "<tool_call|>"
+    parsed = parser.extract_tool_calls_streaming([""], call, call)
+    assert parsed == (None, "get_weather", {"location": "上海"})
+
+
+def test_streaming_ignores_processed_block(parser):
+    """Once a call was seen in the previous text, new text is plain content."""
+    call = "<|tool_call>call:get_weather" '{location:<|"|>上海<|"|>}' "<tool_call|>"
+    tail = " more text"
+    parsed = parser.extract_tool_calls_streaming([call], call + tail, tail)
+    assert parsed == (tail, None, None)
diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py
@@ -41,6 +41,7 @@
 from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
+    GEMMA_TOOL_CALL_FAMILY,
     LLAMA3_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
     ChatModelMixin,
@@ -1079,9 +1080,9 @@ def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
         )
         chat_context_var.set(chat_template_kwargs)
         full_context_kwargs = chat_template_kwargs.copy()
-        if (
-            tools
-            and model_family in QWEN_TOOL_CALL_FAMILY
+        if tools and (
+            model_family in QWEN_TOOL_CALL_FAMILY
+            or model_family in GEMMA_TOOL_CALL_FAMILY
             or model_family in LLAMA3_TOOL_CALL_FAMILY
             or model_family in DEEPSEEK_TOOL_CALL_FAMILY
         ):
diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py
@@ -123,6 +123,8 @@ def get_context_length_from_config(
     "qwen3.5",
 ]
 
+GEMMA_TOOL_CALL_FAMILY = ["gemma-4"]
+
 GLM4_TOOL_CALL_FAMILY = [
     "glm4-chat",
     "glm4-chat-1m",
@@ -142,6 +144,7 @@ def get_context_length_from_config(
 
 TOOL_CALL_FAMILY = (
     QWEN_TOOL_CALL_FAMILY
+    + GEMMA_TOOL_CALL_FAMILY
     + GLM4_TOOL_CALL_FAMILY
     + LLAMA3_TOOL_CALL_FAMILY
     + DEEPSEEK_TOOL_CALL_FAMILY
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
@@ -60,6 +60,7 @@
 from ..llm_family import cache_model_tokenizer_and_config
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
+    GEMMA_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_SYMBOLS,
     ChatModelMixin,
@@ -1670,6 +1671,7 @@ async def async_chat(
         if tools:
             if (
                 model_family in QWEN_TOOL_CALL_FAMILY
+                or model_family in GEMMA_TOOL_CALL_FAMILY
                 or model_family in DEEPSEEK_TOOL_CALL_FAMILY
             ):
                 full_context_kwargs["tools"] = tools
@@ -1963,7 +1965,10 @@ async def async_chat(
             )
             chat_context_var.set(chat_template_kwargs)
             full_context_kwargs = chat_template_kwargs.copy()
-            if tools and model_family in QWEN_TOOL_CALL_FAMILY:
+            if tools and (
+                model_family in QWEN_TOOL_CALL_FAMILY
+                or model_family in GEMMA_TOOL_CALL_FAMILY
+            ):
                 full_context_kwargs["tools"] = tools
             assert self.model_family.chat_template is not None
             if "omni" in self.model_family.model_ability: