Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ dependencies = [
"prometheus_client>=0.21.0",
"cloudevents>=1.12.0",
]

[project.optional-dependencies]
litellm = ["litellm>=1.65,<1.85"]

[dependency-groups]
dev = [
"pytest>=8.2.2",
Expand Down
2 changes: 1 addition & 1 deletion src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

logger = logging.getLogger(__name__)

ModelTransport = Literal["anthropic", "openai", "gemini"]
ModelTransport = Literal["anthropic", "openai", "gemini", "litellm"]
EmbeddingTransport = Literal["openai", "gemini"]


Expand Down
2 changes: 2 additions & 0 deletions src/llm/backends/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from .anthropic import AnthropicBackend
from .gemini import GeminiBackend
from .litellm import LiteLLMBackend
from .openai import OpenAIBackend

__all__ = [
"AnthropicBackend",
"GeminiBackend",
"LiteLLMBackend",
"OpenAIBackend",
]
223 changes: 223 additions & 0 deletions src/llm/backends/litellm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
"""LiteLLM provider backend.

Routes to 100+ LLM providers via a unified interface using provider-prefixed
model names (e.g. ``anthropic/claude-sonnet-4-6``, ``gemini/gemini-2.5-flash``).

Install: ``pip install honcho[litellm]`` (the ``litellm`` extra declared in ``pyproject.toml``)
"""

from __future__ import annotations

import json
import logging
from collections.abc import AsyncIterator
from typing import Any

from pydantic import BaseModel

from src.exceptions import ValidationException
from src.llm.backend import CompletionResult, StreamChunk, ToolCallResult

logger = logging.getLogger(__name__)


class LiteLLMBackend:
"""Provider backend wrapping litellm.acompletion."""

def __init__(self, api_key: str | None = None, api_base: str | None = None) -> None:
self._api_key = api_key
self._api_base = api_base

def _base_kwargs(self) -> dict[str, Any]:
    """Return the keyword arguments shared by every LiteLLM call.

    ``drop_params`` is always set to ``True``. ``api_key`` and ``api_base``
    are included only when the stored value is truthy, so ``None`` and empty
    strings are left out.
    """
    overrides = {"api_key": self._api_key, "api_base": self._api_base}
    configured = {name: value for name, value in overrides.items() if value}
    return {"drop_params": True, **configured}

@staticmethod
def _import_litellm() -> Any:
    """Import ``litellm`` on demand and return the module.

    The import is deferred to call time so the package stays an optional
    dependency.

    Raises:
        ValidationException: If ``litellm`` is not installed; the original
            ``ModuleNotFoundError`` is chained as the cause.
    """
    try:
        import litellm as litellm_module
    except ModuleNotFoundError as missing:
        message = (
            "LiteLLM transport requires optional dependency 'litellm'. "
            "Install with: pip install honcho[litellm]"
        )
        raise ValidationException(message) from missing
    else:
        return litellm_module

async def complete(
self,
*,
model: str,
messages: list[dict[str, Any]],
max_tokens: int,
temperature: float | None = None,
stop: list[str] | None = None,
tools: list[dict[str, Any]] | None = None,
tool_choice: str | dict[str, Any] | None = None,
response_format: type[BaseModel] | dict[str, Any] | None = None,
thinking_budget_tokens: int | None = None,
thinking_effort: str | None = None,
max_output_tokens: int | None = None,
extra_params: dict[str, Any] | None = None,
) -> CompletionResult:
litellm = self._import_litellm()

params = self._build_params(
model=model,
messages=messages,
max_tokens=max_output_tokens or max_tokens,
temperature=temperature,
stop=stop,
tools=tools,
tool_choice=tool_choice,
response_format=response_format,
thinking_effort=thinking_effort,
extra_params=extra_params,
)
Comment on lines +61 to +79
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

🧩 Analysis chain

🌐 Web query:

What is the exact parameter name and shape for setting an Anthropic thinking budget through litellm.acompletion in litellm 1.65?

💡 Result:

In LiteLLM 1.65, when you use litellm.completion(...)/litellm.acompletion(...) with Anthropic extended thinking, the exact parameter is named thinking and its shape is: thinking={"type": "enabled", "budget_tokens": <int>} Example: response = await litellm.acompletion(model="anthropic/claude-sonnet-4-6", messages=[{"role": "user", "content": "…"}], thinking={"type": "enabled", "budget_tokens": 1024}) LiteLLM also documents an OpenAI-style alternative mapping: reasoning_effort is mapped to Anthropic output_config.effort (not budget_tokens) for Claude 4.6/4.7 models. If you specifically need a token budget for thinking, use the thinking parameter above, not reasoning_effort. [1][2][3]

Citations:


🏁 Script executed:

# First, let's examine the litellm.py file to understand its structure
wc -l src/llm/backends/litellm.py

Repository: plastic-labs/honcho

Length of output: 95


🏁 Script executed:

# Read the file with line numbers to see the relevant sections
cat -n src/llm/backends/litellm.py | sed -n '50,180p'

Repository: plastic-labs/honcho

Length of output: 5776


🏁 Script executed:

# Also search for thinking_budget_tokens usage in the file
rg 'thinking_budget_tokens' src/llm/backends/litellm.py -n

Repository: plastic-labs/honcho

Length of output: 171


🏁 Script executed:

# Search for _build_params method to see what parameters it handles
rg '_build_params' src/llm/backends/litellm.py -A 30 -n

Repository: plastic-labs/honcho

Length of output: 3803


Forward thinking_budget_tokens parameter or document that it's ignored.

complete() and stream() accept thinking_budget_tokens but never forward it to _build_params() or use it. LiteLLM requires thinking={"type": "enabled", "budget_tokens": ...} for Anthropic extended thinking with token budgets. Without plumbing this parameter, callers will silently get incorrect behavior. Either construct and pass the thinking parameter based on thinking_budget_tokens, or explicitly log/document that this backend ignores it.

Also applies to: stream() (lines 95–113) and _build_params() (lines 135–176)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/llm/backends/litellm.py` around lines 61 - 79, complete() and stream()
accept thinking_budget_tokens but never forward it to _build_params() or
construct the LiteLLM "thinking" dict, so Anthropic-style token budgets are
ignored; update complete(), stream(), and _build_params() to accept/handle a
thinking parameter by: when thinking_budget_tokens is not None build a thinking
dict like {"type":"enabled","budget_tokens": thinking_budget_tokens} (optionally
include thinking_effort if present) and pass it into _build_params() (or have
_build_params() accept thinking_budget_tokens and assemble the thinking dict
there), ensuring the final params sent to LiteLLM include this thinking field;
alternatively, if you choose not to support it, add an explicit
log/documentation in complete()/stream() and _build_params() stating
thinking_budget_tokens is ignored.


response = await litellm.acompletion(**params)
return self._normalize_response(response)

async def stream(
    self,
    *,
    model: str,
    messages: list[dict[str, Any]],
    max_tokens: int,
    temperature: float | None = None,
    stop: list[str] | None = None,
    tools: list[dict[str, Any]] | None = None,
    tool_choice: str | dict[str, Any] | None = None,
    response_format: type[BaseModel] | dict[str, Any] | None = None,
    thinking_budget_tokens: int | None = None,
    thinking_effort: str | None = None,
    max_output_tokens: int | None = None,
    extra_params: dict[str, Any] | None = None,
) -> AsyncIterator[StreamChunk]:
    """Stream a completion from LiteLLM as ``StreamChunk`` objects.

    Text deltas from the first choice are yielded as content chunks. The
    stream ends with a single ``is_done`` chunk: as soon as a chunk carries
    usage data (the chunk then also reports ``output_tokens``), or otherwise
    after the provider stream is exhausted, provided a finish reason was
    seen. If neither usage nor a finish reason ever arrives, no terminal
    chunk is produced.

    ``max_output_tokens`` replaces ``max_tokens`` when it is truthy.

    NOTE: ``thinking_budget_tokens`` is accepted but is not forwarded to
    LiteLLM by this method; only ``thinking_effort`` reaches the request.
    """
    litellm = self._import_litellm()

    request = {
        **self._build_params(
            model=model,
            messages=messages,
            max_tokens=max_output_tokens or max_tokens,
            temperature=temperature,
            stop=stop,
            tools=tools,
            tool_choice=tool_choice,
            response_format=response_format,
            thinking_effort=thinking_effort,
            extra_params=extra_params,
        ),
        "stream": True,
    }

    events = await litellm.acompletion(**request)
    last_finish: str | None = None
    async for event in events:
        # Only the first choice is inspected; chunks may arrive without choices.
        first_choice = event.choices[0] if event.choices else None
        if first_choice is not None:
            text = first_choice.delta.content
            if text:
                yield StreamChunk(content=text)
            if first_choice.finish_reason:
                last_finish = first_choice.finish_reason

        token_usage = getattr(event, "usage", None)
        if not token_usage:
            continue

        # A chunk carrying usage is treated as terminal: report and stop.
        yield StreamChunk(
            is_done=True,
            finish_reason=last_finish,
            output_tokens=getattr(token_usage, "completion_tokens", None),
        )
        return

    if last_finish:
        yield StreamChunk(is_done=True, finish_reason=last_finish)

def _build_params(
    self,
    *,
    model: str,
    messages: list[dict[str, Any]],
    max_tokens: int,
    temperature: float | None,
    stop: list[str] | None,
    tools: list[dict[str, Any]] | None,
    tool_choice: str | dict[str, Any] | None,
    response_format: type[BaseModel] | dict[str, Any] | None,
    thinking_effort: str | None,
    extra_params: dict[str, Any] | None,
    thinking_budget_tokens: int | None = None,
) -> dict[str, Any]:
    """Assemble the keyword arguments for ``litellm.acompletion``.

    Args:
        model: Provider-prefixed model name.
        messages: Chat messages, forwarded unchanged.
        max_tokens: Output token limit, always sent.
        temperature: Sent whenever it is not ``None`` (so ``0.0`` is kept).
        stop: Stop sequences; omitted when empty or ``None``.
        tools: Tool definitions; run through ``_convert_tools`` when present.
        tool_choice: Sent whenever it is not ``None``.
        response_format: A ``BaseModel`` subclass or a dict; forwarded as-is.
        thinking_effort: Sent as ``reasoning_effort`` when truthy, unless a
            thinking budget is given.
        extra_params: Only ``top_p``, ``frequency_penalty``,
            ``presence_penalty`` and ``seed`` are copied from this mapping;
            every other key is ignored.
        thinking_budget_tokens: Optional explicit thinking budget. When it is
            a positive integer it is sent as LiteLLM's
            ``thinking={"type": "enabled", "budget_tokens": ...}`` and takes
            precedence over ``thinking_effort``, so the request carries a
            single reasoning control. Defaults to ``None`` (no change for
            existing callers).

    Returns:
        A new dict starting with model/messages/max_tokens plus the shared
        base kwargs, followed by whichever optional keys applied.
    """
    params: dict[str, Any] = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        **self._base_kwargs(),
    }
    if temperature is not None:
        params["temperature"] = temperature
    if stop:
        params["stop"] = stop
    if tools:
        params["tools"] = self._convert_tools(tools)
    if tool_choice is not None:
        params["tool_choice"] = tool_choice
    if response_format is not None:
        # Model classes and plain dicts are both handed over untouched, so no
        # per-type branching is needed here.
        params["response_format"] = response_format
    if thinking_budget_tokens is not None and thinking_budget_tokens > 0:
        # An explicit budget wins over the coarser effort level.
        params["thinking"] = {
            "type": "enabled",
            "budget_tokens": thinking_budget_tokens,
        }
    elif thinking_effort:
        params["reasoning_effort"] = thinking_effort
    if extra_params:
        # Allow-list: anything outside these sampling knobs is dropped.
        for key in ("top_p", "frequency_penalty", "presence_penalty", "seed"):
            if key in extra_params:
                params[key] = extra_params[key]
    return params

@staticmethod
def _normalize_response(response: Any) -> CompletionResult:
    """Convert a LiteLLM completion response into a ``CompletionResult``.

    Only the first choice is read. Each tool call's JSON argument string is
    decoded; arguments that are empty yield ``{}``, and arguments that fail to
    decode are logged as a warning and also yield ``{}``, so the tool call is
    still reported. Missing content becomes ``""``, missing usage becomes zero
    token counts, and a missing finish reason becomes ``"stop"``.
    """
    token_usage = getattr(response, "usage", None)
    first_choice = response.choices[0]
    reply = first_choice.message
    reported_finish = first_choice.finish_reason

    def decode_arguments(call: Any) -> dict[str, Any]:
        # Best effort: a malformed payload must not abort the whole response.
        raw = call.function.arguments
        if not raw:
            return {}
        try:
            return json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            logger.warning(
                "Malformed tool arguments for %s (id=%s)",
                call.function.name,
                call.id,
            )
            return {}

    requested_calls = getattr(reply, "tool_calls", None) or []
    parsed_calls: list[ToolCallResult] = [
        ToolCallResult(
            id=call.id,
            name=call.function.name,
            input=decode_arguments(call),
        )
        for call in requested_calls
    ]

    if token_usage:
        prompt_count = getattr(token_usage, "prompt_tokens", 0)
        completion_count = getattr(token_usage, "completion_tokens", 0)
    else:
        prompt_count = 0
        completion_count = 0

    return CompletionResult(
        content=getattr(reply, "content", "") or "",
        input_tokens=prompt_count,
        output_tokens=completion_count,
        finish_reason=reported_finish or "stop",
        tool_calls=parsed_calls,
        raw_response=response,
    )

@staticmethod
def _convert_tools(tools: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return ``tools`` in the ``{"type": "function", ...}`` wrapper format.

    The input list is returned unchanged (same object) when it is empty or
    when its first entry already has ``type == "function"``; only the first
    entry is inspected. Otherwise every entry is expected to provide
    ``name``, ``description`` and ``input_schema`` keys, which are mapped to
    ``name``, ``description`` and ``parameters`` of a function definition.
    """
    if not tools:
        return tools
    if tools[0].get("type") == "function":
        return tools

    wrapped: list[dict[str, Any]] = []
    for spec in tools:
        function_def = {
            "name": spec["name"],
            "description": spec["description"],
            "parameters": spec["input_schema"],
        }
        wrapped.append({"type": "function", "function": function_def})
    return wrapped
15 changes: 11 additions & 4 deletions src/llm/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from .backend import ProviderBackend
from .backends.anthropic import AnthropicBackend
from .backends.gemini import GeminiBackend
from .backends.litellm import LiteLLMBackend
from .backends.openai import OpenAIBackend
from .credentials import default_transport_api_key
from .history_adapters import (
Expand Down Expand Up @@ -108,7 +109,7 @@ def get_gemini_override_client(
def client_for_model_config(
provider: ModelTransport,
model_config: ModelConfig,
) -> ProviderClient:
) -> ProviderClient | None:
"""Resolve the provider client for a ModelConfig.

Fast path: no overrides → reuse the module-level default client from
Expand All @@ -131,12 +132,14 @@ def client_for_model_config(
return get_openai_override_client(base_url, api_key)
if provider == "gemini":
return get_gemini_override_client(base_url, api_key)
if provider == "litellm":
return None # LiteLLMBackend manages its own credentials
assert_never(provider)
Comment thread
coderabbitai[bot] marked this conversation as resolved.


def backend_for_provider(
provider: ModelTransport,
client: ProviderClient,
client: ProviderClient | None,
) -> ProviderBackend:
"""Wrap a raw provider SDK client in the matching ProviderBackend adapter."""
if provider == "anthropic":
Expand All @@ -145,6 +148,8 @@ def backend_for_provider(
return OpenAIBackend(client)
if provider == "gemini":
return GeminiBackend(client)
if provider == "litellm":
return LiteLLMBackend()
assert_never(provider)


Expand All @@ -154,18 +159,20 @@ def history_adapter_for_provider(provider: ModelTransport) -> HistoryAdapter:
return AnthropicHistoryAdapter()
if provider == "gemini":
return GeminiHistoryAdapter()
return OpenAIHistoryAdapter()
return OpenAIHistoryAdapter() # litellm uses OpenAI message format


def get_backend(config: ModelConfig) -> ProviderBackend:
"""High-level one-shot backend factory: ModelConfig ProviderBackend.
"""High-level one-shot backend factory: ModelConfig -> ProviderBackend.

Delegates client resolution to ``client_for_model_config``, which owns
the CLIENTS fast-path and the missing-API-key validation. Both the
production path (via ``honcho_llm_call_inner``) and the live-test path
(via this function) now construct clients through the same helper, so
validation behavior stays consistent.
"""
if config.transport == "litellm":
return LiteLLMBackend(api_key=config.api_key, api_base=config.base_url)
client = client_for_model_config(config.transport, config)
return backend_for_provider(config.transport, client)

Expand Down
2 changes: 1 addition & 1 deletion src/llm/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class AttemptPlan:

provider: ModelTransport
model: str
client: ProviderClient
client: ProviderClient | None
thinking_budget_tokens: int | None
reasoning_effort: ReasoningEffortType
selected_config: ModelConfig
Expand Down
Loading