feat(dreamer): allow per-specialist provider and thinking budget overrides

offendingcommit · offendingcommit · commit c799a2ca1946 · 2026-04-24T11:48:08.000-05:00
Adds DEDUCTION_PROVIDER/INDUCTION_PROVIDER and matching *_THINKING_BUDGET_TOKENS
settings so deduction and induction specialists can route to a different
provider than the main DREAM config. Also propagates thinking_budget_tokens
into the LLM call and documents the CF gateway / Gemini thought_signature
gotchas in CLAUDE.md.
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -84,6 +84,13 @@ All API routes follow the pattern: `/v1/{resource}/{id}/{action}`
 - Typechecking: `uv run basedpyright`
 - Format code: `uv run ruff format src/`
 
+### LLM provider gotchas (learned 2026-04-16 in k8s deploy)
+
+- **Structured outputs (`response_format={"type": "json_schema"}`) only work on providers whose upstream API natively honors them.** Google Gemini does (route via `cf` provider with base_url ending in `/openai`). Ollama Cloud (reached via the `custom` provider + `custom-ollama` CF gateway endpoint, or any direct Ollama endpoint) does **not** translate `response_format` into Ollama's native JSON-mode — every Ollama Cloud model (GLM-5.1, nemotron-3-nano, qwen3.5, devstral-small-2 confirmed) returns free-form text/markdown when a schema is requested, and `honcho_llm_call` bubbles a `ValidationError: Invalid JSON` out of pydantic parsing.
+- **Therefore: deriver (`src/deriver/deriver.py:126`) and summary (`src/utils/summarizer.py`) must stay on a Gemini-backed `cf` provider.** Dream, dialectic, and any other free-form / tool-call paths are free to use the `custom` provider.
+- **Gemini `thoughtSignature` round-tripping breaks on the CF `openai`-compat route.** Any call with `maxToolIterations > 1` AND `thinkingBudgetTokens > 0` will return `400 Function call is missing a thought_signature` on iteration 2+. If you need thinking on a multi-iteration tool loop, use the native Gemini provider, not the OpenAI-compat route — or set `thinkingBudgetTokens=0`.
+- **None of this is Cloudflare's fault.** CF AI Gateway is a transparent proxy in both the `openai` and `custom-ollama` routes. The limitations live at the upstream providers: Ollama Cloud's OpenAI-compat layer (no structured outputs) and Gemini's OpenAI-compat endpoint (no `thought_signature` round-tripping).
+
 ### Local LM Studio Setup
 
 - Honcho can use LM Studio for generation through the `custom` provider path.
diff --git a/src/config.py b/src/config.py
@@ -561,12 +561,14 @@ class DreamSettings(BackupLLMSettingsMixin, HonchoSettings):
         16_384
     )
 
-    ## NOTE: specialist models use the same provider as the main model
-
     # Deduction Specialist: handles logical inference
     DEDUCTION_MODEL: str = "claude-haiku-4-5"
+    DEDUCTION_PROVIDER: SupportedProviders | None = None  # falls back to PROVIDER
+    DEDUCTION_THINKING_BUDGET_TOKENS: int | None = None  # falls back to THINKING_BUDGET_TOKENS
     # Induction Specialist: identifies patterns across observations
     INDUCTION_MODEL: str = "claude-haiku-4-5"
+    INDUCTION_PROVIDER: SupportedProviders | None = None  # falls back to PROVIDER
+    INDUCTION_THINKING_BUDGET_TOKENS: int | None = None  # falls back to THINKING_BUDGET_TOKENS
 
     # Surprisal-based sampling subsystem
     SURPRISAL: SurprisalSettings = Field(default_factory=SurprisalSettings)
diff --git a/src/dreamer/specialists.py b/src/dreamer/specialists.py
@@ -74,6 +74,14 @@ def get_model(self) -> str:
         """Get the model to use for this specialist."""
         ...
 
+    def get_provider(self) -> str | None:
+        """Get the provider override for this specialist, or None to inherit from DREAM."""
+        return None
+
+    def get_thinking_budget(self) -> int | None:
+        """Get the thinking budget override, or None to inherit from DREAM."""
+        return None
+
     def get_max_tokens(self) -> int:
         """Get max output tokens for this specialist."""
         return 16384
@@ -196,9 +204,16 @@ async def run(
             parent_category="dream",
         )
 
-        # Get model with potential override
+        # Get model, provider, and thinking budget with potential overrides
         model = self.get_model()
-        llm_settings = settings.DREAM.model_copy(update={"MODEL": model})
+        provider = self.get_provider()
+        thinking_budget = self.get_thinking_budget()
+        overrides: dict[str, Any] = {"MODEL": model}
+        if provider is not None:
+            overrides["PROVIDER"] = provider
+        if thinking_budget is not None:
+            overrides["THINKING_BUDGET_TOKENS"] = thinking_budget
+        llm_settings = settings.DREAM.model_copy(update=overrides)
 
         # Track iterations via callback
         iteration_count = 0
@@ -219,6 +234,7 @@ def iteration_callback(data: Any) -> None:
             messages=messages,
             track_name=f"Dreamer/{self.name}",
             iteration_callback=iteration_callback,
+            thinking_budget_tokens=llm_settings.THINKING_BUDGET_TOKENS,
         )
 
         # Log metrics
@@ -308,6 +324,12 @@ def get_tools(self, *, peer_card_enabled: bool = True) -> list[dict[str, Any]]:
     def get_model(self) -> str:
         return settings.DREAM.DEDUCTION_MODEL
 
+    def get_provider(self) -> str | None:
+        return settings.DREAM.DEDUCTION_PROVIDER
+
+    def get_thinking_budget(self) -> int | None:
+        return settings.DREAM.DEDUCTION_THINKING_BUDGET_TOKENS
+
     def get_max_tokens(self) -> int:
         return 8192
 
@@ -451,6 +473,12 @@ def get_tools(self, *, peer_card_enabled: bool = True) -> list[dict[str, Any]]:
     def get_model(self) -> str:
         return settings.DREAM.INDUCTION_MODEL
 
+    def get_provider(self) -> str | None:
+        return settings.DREAM.INDUCTION_PROVIDER
+
+    def get_thinking_budget(self) -> int | None:
+        return settings.DREAM.INDUCTION_THINKING_BUDGET_TOKENS
+
     def get_max_tokens(self) -> int:
         return 8192