Skip to content

Commit 32af03a

Browse files
fix(cf): preserve thought_signature and fix token budget for Gemini thinking models
- Add `cf` provider (Cloudflare AI Gateway) to SupportedProviders and initialize AsyncOpenAI client pointed at CF_GATEWAY_BASE_URL
- Route OpenAI embeddings through CF Gateway when LLM_OPENAI_BASE_URL is set
- Convert tools to OpenAI format for `cf` provider (was missing from provider list)
- Extract thought_signature from OpenAI-compat tool call responses and re-include it when formatting assistant messages for multi-turn replay — fixes 400 INVALID_ARGUMENT from Gemini thinking models via CF Gateway
- Preserve thought_signature in _format_assistant_tool_message else branch
- Increase DERIVER_MAX_INPUT_TOKENS upper bound (23000 → 200000) to allow higher limits via config
1 parent 68d88bd commit 32af03a

4 files changed

Lines changed: 74 additions & 28 deletions

File tree

src/config.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,11 +208,19 @@ class LLMSettings(HonchoSettings):
208208
GEMINI_API_KEY: str | None = None
209209
GROQ_API_KEY: str | None = None
210210
OPENAI_COMPATIBLE_BASE_URL: str | None = None
211+
OPENAI_BASE_URL: str | None = None # Route OpenAI calls through a proxy/gateway
211212

212213
# Separate vLLM endpoint (for local models)
213214
VLLM_API_KEY: str | None = None
214215
VLLM_BASE_URL: str | None = None
215216

217+
# Cloudflare AI Gateway (OpenAI-compatible universal endpoint)
218+
# CF_GATEWAY_API_KEY = provider API key (e.g. Gemini key for google-ai-studio/ models)
219+
# CF_GATEWAY_AUTH_TOKEN = cfut_ gateway token → sent in cf-aig-authorization header
220+
CF_GATEWAY_API_KEY: str | None = None
221+
CF_GATEWAY_BASE_URL: str | None = None
222+
CF_GATEWAY_AUTH_TOKEN: str | None = None
223+
216224
EMBEDDING_PROVIDER: Literal["openai", "gemini", "openrouter"] = "openai"
217225
EMBEDDING_MODEL: str | None = None
218226

@@ -261,7 +269,7 @@ class DeriverSettings(BackupLLMSettingsMixin, HonchoSettings):
261269

262270
LOG_OBSERVATIONS: bool = False
263271

264-
MAX_INPUT_TOKENS: Annotated[int, Field(default=23000, gt=0, le=23000)] = 23000
272+
MAX_INPUT_TOKENS: Annotated[int, Field(default=23000, gt=0, le=200_000)] = 23000
265273

266274
# Maximum number of observations to return in working representation
267275
# This is applied to both explicit and deductive observations

src/embedding_client.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import logging
33
import threading
44
from collections import defaultdict
5-
from typing import NamedTuple
5+
from typing import Any, NamedTuple
66

77
import tiktoken
88
from google import genai
@@ -60,7 +60,14 @@ def __init__(self, api_key: str | None = None, provider: str | None = None):
6060
api_key = settings.LLM.OPENAI_API_KEY
6161
if not api_key:
6262
raise ValueError("OpenAI API key is required")
63-
self.client = AsyncOpenAI(api_key=api_key)
63+
_emb_kwargs: dict[str, Any] = {"api_key": api_key}
64+
if settings.LLM.OPENAI_BASE_URL:
65+
_emb_kwargs["base_url"] = settings.LLM.OPENAI_BASE_URL
66+
if settings.LLM.CF_GATEWAY_AUTH_TOKEN:
67+
_emb_kwargs["default_headers"] = {
68+
"cf-aig-authorization": f"Bearer {settings.LLM.CF_GATEWAY_AUTH_TOKEN}"
69+
}
70+
self.client = AsyncOpenAI(**_emb_kwargs)
6471
self.model = "text-embedding-3-small"
6572
self.max_embedding_tokens = settings.MAX_EMBEDDING_TOKENS
6673
self.max_batch_size = 2048 # OpenAI batch limit

src/utils/clients.py

Lines changed: 55 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -258,17 +258,36 @@ def _get_effective_temperature(temperature: float | None) -> float | None:
258258
CLIENTS["anthropic"] = anthropic
259259

260260
if settings.LLM.OPENAI_API_KEY:
261-
openai_client = AsyncOpenAI(
262-
api_key=settings.LLM.OPENAI_API_KEY,
263-
)
264-
CLIENTS["openai"] = openai_client
261+
_openai_kwargs: dict[str, Any] = {"api_key": settings.LLM.OPENAI_API_KEY}
262+
if settings.LLM.OPENAI_BASE_URL:
263+
_openai_kwargs["base_url"] = settings.LLM.OPENAI_BASE_URL
264+
if settings.LLM.CF_GATEWAY_AUTH_TOKEN:
265+
_openai_kwargs["default_headers"] = {
266+
"cf-aig-authorization": f"Bearer {settings.LLM.CF_GATEWAY_AUTH_TOKEN}"
267+
}
268+
CLIENTS["openai"] = AsyncOpenAI(**_openai_kwargs)
265269

266270
if settings.LLM.OPENAI_COMPATIBLE_API_KEY and settings.LLM.OPENAI_COMPATIBLE_BASE_URL:
267271
CLIENTS["custom"] = AsyncOpenAI(
268272
api_key=settings.LLM.OPENAI_COMPATIBLE_API_KEY,
269273
base_url=settings.LLM.OPENAI_COMPATIBLE_BASE_URL,
270274
)
271275

276+
# Cloudflare AI Gateway (OpenAI-compatible universal endpoint)
277+
# CF_GATEWAY_API_KEY = provider key passed in Authorization (e.g. Gemini key for google-ai-studio/)
278+
# CF_GATEWAY_AUTH_TOKEN = cfut_ gateway token passed in cf-aig-authorization (optional, for gateway auth)
279+
if settings.LLM.CF_GATEWAY_API_KEY and settings.LLM.CF_GATEWAY_BASE_URL:
280+
_cf_extra_headers: dict[str, str] = {}
281+
if settings.LLM.CF_GATEWAY_AUTH_TOKEN:
282+
_cf_extra_headers["cf-aig-authorization"] = (
283+
f"Bearer {settings.LLM.CF_GATEWAY_AUTH_TOKEN}"
284+
)
285+
CLIENTS["cf"] = AsyncOpenAI(
286+
api_key=settings.LLM.CF_GATEWAY_API_KEY,
287+
base_url=settings.LLM.CF_GATEWAY_BASE_URL,
288+
default_headers=_cf_extra_headers,
289+
)
290+
272291
# vLLM uses separate settings for local model serving
273292
if settings.LLM.VLLM_API_KEY and settings.LLM.VLLM_BASE_URL:
274293
CLIENTS["vllm"] = AsyncOpenAI(
@@ -334,9 +353,9 @@ def convert_tools_for_provider(
334353
if provider == "anthropic":
335354
# Anthropic format: input_schema
336355
return tools
337-
elif provider in ("openai", "custom", "vllm"):
356+
elif provider in ("openai", "custom", "vllm", "cf"):
338357
# OpenAI format: parameters instead of input_schema
339-
# custom and vllm use AsyncOpenAI client so need OpenAI format
358+
# custom, vllm, and cf use AsyncOpenAI client so need OpenAI format
340359
return [
341360
{
342361
"type": "function",
@@ -1103,16 +1122,20 @@ def _format_assistant_tool_message(
11031122
# OpenAI format - must include tool_calls in the assistant message
11041123
openai_tool_calls: list[Any] = []
11051124
for tool_call in tool_calls:
1106-
openai_tool_calls.append(
1107-
{
1108-
"id": tool_call["id"],
1109-
"type": "function",
1110-
"function": {
1111-
"name": tool_call["name"],
1112-
"arguments": json.dumps(tool_call["input"]),
1113-
},
1114-
}
1115-
)
1125+
oa_call: dict[str, Any] = {
1126+
"id": tool_call["id"],
1127+
"type": "function",
1128+
"function": {
1129+
"name": tool_call["name"],
1130+
"arguments": json.dumps(tool_call["input"]),
1131+
},
1132+
}
1133+
# Preserve thought_signature for Gemini thinking models via CF Gateway.
1134+
# Required for multi-turn tool use — Gemini rejects requests where a
1135+
# function call in the history is missing its thought_signature.
1136+
if "thought_signature" in tool_call:
1137+
oa_call["thought_signature"] = tool_call["thought_signature"]
1138+
openai_tool_calls.append(oa_call)
11161139
msg: dict[str, Any] = {
11171140
"role": "assistant",
11181141
"content": content if isinstance(content, str) else None,
@@ -2046,15 +2069,23 @@ async def honcho_llm_call_inner(
20462069
tool_calls_list: list[dict[str, Any]] = []
20472070
if response.choices[0].message.tool_calls: # pyright: ignore
20482071
for tool_call in response.choices[0].message.tool_calls: # pyright: ignore
2049-
tool_calls_list.append(
2050-
{
2051-
"id": tool_call.id, # pyright: ignore
2052-
"name": tool_call.function.name, # pyright: ignore
2053-
"input": json.loads(tool_call.function.arguments) # pyright: ignore
2054-
if tool_call.function.arguments # pyright: ignore
2055-
else {},
2056-
}
2072+
call_data: dict[str, Any] = {
2073+
"id": tool_call.id, # pyright: ignore
2074+
"name": tool_call.function.name, # pyright: ignore
2075+
"input": json.loads(tool_call.function.arguments) # pyright: ignore
2076+
if tool_call.function.arguments # pyright: ignore
2077+
else {},
2078+
}
2079+
# Preserve thought_signature for Gemini thinking models via CF
2080+
# Gateway — required for multi-turn tool use replay.
2081+
thought_sig = getattr(tool_call, "thought_signature", None) or ( # pyright: ignore
2082+
tool_call.model_extra.get("thought_signature") # pyright: ignore
2083+
if getattr(tool_call, "model_extra", None) # pyright: ignore
2084+
else None
20572085
)
2086+
if thought_sig:
2087+
call_data["thought_signature"] = thought_sig
2088+
tool_calls_list.append(call_data)
20582089

20592090
cache_creation, cache_read = extract_openai_cache_tokens(usage)
20602091
return HonchoLLMCallResponse(

src/utils/types.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ async def post_commit(self) -> None:
3434
await self.on_commit()
3535

3636

37-
SupportedProviders = Literal["anthropic", "openai", "google", "groq", "custom", "vllm"]
37+
SupportedProviders = Literal["anthropic", "openai", "google", "groq", "custom", "vllm", "cf"]
3838
TaskType = Literal[
3939
"webhook", "summary", "representation", "dream", "deletion", "reconciler"
4040
]

0 commit comments

Comments
 (0)