fix: round-trip Gemini 3 thought signatures through Vertex AI converter

major · major · commit 291b8d58fd7a · 2026-06-10T17:14:54.000-05:00
Gemini 3.x models (gemini-3-flash, gemini-3.5-flash) attach a
thought_signature to the first functionCall part of a tool-calling turn and
require it to be replayed verbatim on the next turn, or the request fails
with HTTP 400. llama-stack converts Gemini responses into the OpenAI
chat-completion shape, which has no field for the signature, so it is
dropped and every multi-turn tool call against a Gemini 3 model fails.

Monkeypatch llama-stack's vertexai converter at app import time. Both
wrappers defer entirely to the upstream originals and only smuggle the
base64-encoded signature in and out through the opaque tool-call id (which
llama-stack round-trips untouched and only ever compares for equality):
the extract wrapper re-pairs each functionCall part with the tool call the
original emitted and embeds the signature in its id; the assistant-message
wrapper decodes it back onto the rebuilt Gemini part.

The patch is idempotent and a no-op when the Vertex AI provider is not
installed. Remove it once the fix lands upstream.

Signed-off-by: Major Hayden <major@redhat.com>
diff --git a/src/app/main.py b/src/app/main.py
@@ -27,11 +27,20 @@
 from sentry import initialize_sentry
 from utils.common import register_mcp_servers_async
 from utils.llama_stack_version import check_llama_stack_version
+from utils.vertexai_thought_signature import (
+    apply_patch as apply_vertexai_thought_signature_patch,
+)
 
 logger = get_logger(__name__)
 
 logger.info("Initializing app")
 
+# DOWNSTREAM PATCH: carry Gemini 3 thought signatures through llama-stack's
+# Vertex AI converter so multi-turn tool calls against gemini-3.x models do not
+# fail with HTTP 400. Applied at import time so every worker process patches
+# before serving requests. Remove once the fix lands upstream.
+apply_vertexai_thought_signature_patch()
+
 
 service_name = configuration.configuration.name
 
diff --git a/src/utils/vertexai_thought_signature.py b/src/utils/vertexai_thought_signature.py
@@ -0,0 +1,159 @@
+"""Round-trip Gemini 3 thought signatures through llama-stack's Vertex AI path.
+
+Gemini 3.x models (for example ``gemini-3-flash`` and ``gemini-3.5-flash``)
+attach a ``thought_signature`` to the first ``functionCall`` part of a
+tool-calling turn. The signature MUST be replayed verbatim on the following
+turn or Gemini rejects the request with HTTP 400.
+
+llama-stack converts Gemini responses into the OpenAI chat-completion shape
+before they re-enter its own history, and that shape has no field for a
+thought signature, so the signature is dropped and every multi-turn tool call
+against a Gemini 3 model fails. This module monkeypatches llama-stack's
+``vertexai`` converter so the signature survives the round trip.
+
+Strategy: both patched functions are thin wrappers around the upstream
+originals. We copy none of llama-stack's conversion logic; we only smuggle the
+signature in and out through the opaque tool-call ``id`` (which llama-stack
+round-trips untouched and only ever compares for equality).
+
+- On the way out (Gemini -> OpenAI): ``_extract_candidate_parts`` produces a
+  random tool-call id per ``functionCall`` part. We call the original, then
+  re-walk the candidate's parts in the same deterministic order, pair each
+  ``functionCall`` part with the tool call the original emitted, and rewrite
+  that tool call's id to embed the base64-encoded signature.
+
+- On the way back (OpenAI -> Gemini): ``_convert_assistant_message`` builds
+  the Gemini ``parts``. We call the original, then re-pair each
+  ``function_call`` part with its source tool call (same order) and attach the
+  decoded signature.
+
+This file shadows behaviour tied to a specific llama-stack release. Remove it
+once the upstream Vertex AI provider carries thought signatures natively.
+"""
+
+import base64
+from typing import Any
+
+from log import get_logger
+
+logger = get_logger(__name__)
+
+# Sentinel separating the real tool-call id from a smuggled Gemini
+# thought_signature. Chosen to be vanishingly unlikely in a normal id.
+_THOUGHT_SIG_SEP = "::gts::"
+
+# Set once the patch has been applied so repeated startup calls are no-ops.
+_PATCH_APPLIED = False
+
+
+def _encode_thought_signature_into_id(call_id: str, signature: Any) -> str:
+    """Return ``call_id`` with the Gemini thought_signature appended as base64.
+
+    A ``str`` signature is UTF-8 encoded first; anything else is passed through
+    ``bytes()``. The result has the shape ``<call_id><separator><base64>``.
+    ``call_id`` comes back unchanged when the signature is falsy or cannot be
+    turned into bytes.
+    """
+    if signature:
+        try:
+            raw = signature.encode("utf-8") if isinstance(signature, str) else signature
+            suffix = base64.b64encode(bytes(raw)).decode("ascii")
+        except (TypeError, ValueError):
+            return call_id
+        return f"{call_id}{_THOUGHT_SIG_SEP}{suffix}"
+    return call_id
+
+
+def _decode_thought_signature_from_id(call_id: str) -> bytes | None:
+    """Recover the thought_signature bytes smuggled into a tool-call id, else ``None``."""
+    if not call_id or _THOUGHT_SIG_SEP not in call_id:
+        return None
+    encoded = call_id.rpartition(_THOUGHT_SIG_SEP)[2]  # base64 cannot contain the separator
+    try:
+        return base64.b64decode(encoded, validate=True)  # strict: reject stray characters
+    except (ValueError, TypeError):
+        return None
+
+
+def _iter_function_call_parts(candidate: Any) -> list[Any]:
+    """Collect a Gemini candidate's ``functionCall`` parts in their original order.
+
+    A part is kept only when it is not flagged as a thought, has no text
+    (``text is None``) and carries a non-``None`` ``function_call``. This is
+    meant to follow the order llama-stack's ``_extract_candidate_parts`` emits
+    tool calls in, so the two sequences can be paired by position.
+    """
+
+    def is_function_call(part: Any) -> bool:
+        # Thought and text checks come first, as in the upstream walk.
+        if getattr(part, "thought", None) or getattr(part, "text", None) is not None:
+            return False
+        return getattr(part, "function_call", None) is not None
+
+    content = getattr(candidate, "content", None)
+    all_parts = getattr(content, "parts", None) or []
+    # A missing content/parts attribute simply yields an empty list.
+    return [part for part in all_parts if is_function_call(part)]
+
+
+def apply_patch() -> bool:
+    """Monkeypatch the Vertex AI converter to carry Gemini thought signatures.
+
+    Idempotent: a marker on the installed wrapper prevents double wrapping even
+    if the module flag was reset. Returns ``True`` if the patch is in effect
+    after the call, ``False`` (nothing changed) if the converter is not importable.
+    """
+    global _PATCH_APPLIED
+    if _PATCH_APPLIED:
+        return True
+
+    try:
+        from llama_stack.providers.remote.inference.vertexai import converters
+    except ImportError:
+        logger.info(
+            "Vertex AI converter not importable; skipping Gemini thought-signature patch"
+        )
+        return False
+
+    original_extract = converters._extract_candidate_parts
+    original_convert_assistant = converters._convert_assistant_message
+    # Already wrapped (flag reset by a reload or a test): wrapping again would
+    # append the signature to the id twice, so only re-arm the flag.
+    if getattr(original_extract, "_gts_patched", False):
+        _PATCH_APPLIED = True
+        return True
+
+    def patched_extract_candidate_parts(candidate: Any) -> Any:
+        text_parts, thinking_parts, tool_calls = original_extract(candidate)
+        if not tool_calls:
+            return text_parts, thinking_parts, tool_calls
+        fc_parts = _iter_function_call_parts(candidate)
+        # One tool call per function-call part, same order: pair positionally.
+        for tool_call, part in zip(tool_calls, fc_parts):
+            signature = getattr(part, "thought_signature", None)
+            if not signature:
+                continue
+            tool_call.id = _encode_thought_signature_into_id(tool_call.id, signature)
+        return text_parts, thinking_parts, tool_calls
+
+    def patched_convert_assistant_message(msg: dict[str, Any]) -> dict[str, Any] | None:
+        result = original_convert_assistant(msg)
+        if result is None:
+            return None
+        tool_calls = msg.get("tool_calls") or []
+        if not tool_calls:
+            return result
+        # One function_call part per tool call, in order: pair positionally.
+        fc_parts = [p for p in result.get("parts", []) if "function_call" in p]
+        for part, tool_call in zip(fc_parts, tool_calls):
+            call_id = converters._to_dict(tool_call).get("id", "")
+            signature = _decode_thought_signature_from_id(call_id)
+            if signature is not None:
+                part["thought_signature"] = signature
+        return result
+
+    patched_extract_candidate_parts._gts_patched = True  # type: ignore[attr-defined]
+    converters._extract_candidate_parts = patched_extract_candidate_parts
+    converters._convert_assistant_message = patched_convert_assistant_message
+    _PATCH_APPLIED = True
+    logger.info("Applied Gemini 3 thought-signature patch to Vertex AI converter")
+    return True
diff --git a/tests/unit/utils/test_vertexai_thought_signature.py b/tests/unit/utils/test_vertexai_thought_signature.py
@@ -0,0 +1,142 @@
+"""Unit tests for the Gemini 3 thought-signature Vertex AI converter patch."""
+
+from types import SimpleNamespace
+
+import pytest
+
+from utils import vertexai_thought_signature as patch
+
+
+@pytest.fixture(autouse=True)
+def _reset_patch_state(monkeypatch):
+    """Start every test with the apply-once flag cleared; the converter is not restored."""
+    monkeypatch.setattr(patch, "_PATCH_APPLIED", False)
+    yield
+
+
+class TestEncodeDecode:
+    """The base64 smuggling helpers round-trip signatures through the tool-call id."""
+
+    def test_round_trip_bytes(self):
+        sig = b"\x01\x02\xfe\xffsig"  # includes bytes that are not valid UTF-8
+        encoded = patch._encode_thought_signature_into_id("call_x", sig)
+        assert encoded.startswith("call_x")  # the real id stays as the prefix
+        assert patch._THOUGHT_SIG_SEP in encoded
+        assert patch._decode_thought_signature_from_id(encoded) == sig
+
+    def test_round_trip_str_signature(self):
+        encoded = patch._encode_thought_signature_into_id("call_x", "abc")
+        assert patch._decode_thought_signature_from_id(encoded) == b"abc"  # str comes back as bytes
+
+    def test_no_signature_leaves_id_untouched(self):
+        assert patch._encode_thought_signature_into_id("call_x", None) == "call_x"
+        assert patch._encode_thought_signature_into_id("call_x", b"") == "call_x"  # empty == absent
+
+    def test_plain_id_decodes_to_none(self):
+        assert patch._decode_thought_signature_from_id("call_x") is None  # no separator present
+        assert patch._decode_thought_signature_from_id("") is None
+
+    def test_corrupt_payload_decodes_to_none(self):
+        corrupt = f"call_x{patch._THOUGHT_SIG_SEP}!!!not-base64!!!"  # separator, invalid payload
+        assert patch._decode_thought_signature_from_id(corrupt) is None
+
+
+def _make_part(**kw):
+    """Fake Gemini part whose known attributes default to None unless overridden."""
+    defaults = dict.fromkeys(("thought", "text", "function_call", "thought_signature"))
+    return SimpleNamespace(**{**defaults, **kw})
+
+
+def _make_candidate(parts):  # fake Gemini candidate: content.parts=parts, finish_reason=None
+    return SimpleNamespace(content=SimpleNamespace(parts=parts), finish_reason=None)
+
+
+class TestApplyPatch:
+    """apply_patch wires the wrappers onto the real converter and is idempotent."""
+
+    def test_apply_is_idempotent(self):
+        converters = pytest.importorskip(
+            "llama_stack.providers.remote.inference.vertexai.converters"
+        )
+        assert patch.apply_patch() is True
+        first = converters._extract_candidate_parts
+        assert patch.apply_patch() is True  # second call within the same test
+        assert converters._extract_candidate_parts is first  # wrapper was not replaced
+
+    def test_apply_missing_provider_returns_false(self, monkeypatch):
+        import builtins
+
+        real_import = builtins.__import__
+
+        def fake_import(name, *args, **kwargs):
+            if name.startswith("llama_stack.providers.remote.inference.vertexai"):
+                raise ImportError("provider not installed")
+            return real_import(name, *args, **kwargs)
+
+        monkeypatch.setattr(builtins, "__import__", fake_import)  # fail apply_patch's lazy import
+        assert patch.apply_patch() is False
+
+    def test_extract_embeds_signature_into_tool_call_id(self):
+        converters = pytest.importorskip(
+            "llama_stack.providers.remote.inference.vertexai.converters"
+        )
+        patch.apply_patch()
+        sig = b"\x10\x20signature"
+        fc = SimpleNamespace(name="search_portal", args={"q": "selinux"})
+        cand = _make_candidate(
+            [
+                _make_part(text="thinking"),  # text-only part ahead of the function call
+                _make_part(function_call=fc, thought_signature=sig),
+            ]
+        )
+        _text, _thinking, tool_calls = converters._extract_candidate_parts(cand)
+        assert len(tool_calls) == 1
+        # The function name is left alone; the signature travels inside the id.
+        assert tool_calls[0].function.name == "search_portal"
+        assert patch._decode_thought_signature_from_id(tool_calls[0].id) == sig
+
+    def test_round_trip_through_assistant_message(self):
+        converters = pytest.importorskip(
+            "llama_stack.providers.remote.inference.vertexai.converters"
+        )
+        patch.apply_patch()
+        sig = b"round-trip-bytes"
+        fc = SimpleNamespace(name="get_document", args={"id": "1"})
+        cand = _make_candidate([_make_part(function_call=fc, thought_signature=sig)])
+        _t, _th, tool_calls = converters._extract_candidate_parts(cand)
+
+        msg = {
+            "role": "assistant",
+            "content": None,
+            "tool_calls": [
+                {
+                    "id": tool_calls[0].id,  # id produced by the patched extract step
+                    "type": "function",
+                    "function": {"name": "get_document", "arguments": '{"id":"1"}'},
+                }
+            ],
+        }
+        out = converters._convert_assistant_message(msg)
+        fc_parts = [p for p in out["parts"] if "function_call" in p]
+        assert len(fc_parts) == 1
+        assert fc_parts[0]["thought_signature"] == sig  # original bytes restored on the part
+
+    def test_tool_call_without_signature_stays_clean(self):
+        converters = pytest.importorskip(
+            "llama_stack.providers.remote.inference.vertexai.converters"
+        )
+        patch.apply_patch()
+        msg = {
+            "role": "assistant",
+            "content": None,
+            "tool_calls": [
+                {
+                    "id": "call_plain",  # no separator, so nothing to decode
+                    "type": "function",
+                    "function": {"name": "x", "arguments": "{}"},
+                }
+            ],
+        }
+        out = converters._convert_assistant_message(msg)
+        fc_part = next(p for p in out["parts"] if "function_call" in p)
+        assert "thought_signature" not in fc_part