Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from hindsight_api.config import DEFAULT_LLM_TIMEOUT, ENV_LLM_TIMEOUT
from hindsight_api.engine.llm_interface import LLMInterface, OutputTooLongError
from hindsight_api.engine.llm_trace import LLMResponseUsage, stash_response_usage
from hindsight_api.engine.providers.model_capabilities import supports_openai_compatible_reasoning
from hindsight_api.engine.response_models import LLMToolCall, LLMToolCallResult, TokenUsage
from hindsight_api.metrics import get_metrics_collector
from hindsight_api.worker.stage import set_stage
Expand Down Expand Up @@ -159,12 +160,18 @@ def _build_common_kwargs(
if self._default_headers:
kwargs.setdefault("extra_headers", dict(self._default_headers))

if self._should_omit_temperature():
kwargs.pop("temperature", None)

# Bedrock service tier: flex (50% cheaper), priority, or reserved
if self.model.startswith("bedrock/") and self.bedrock_service_tier is not None:
kwargs["service_tier"] = self.bedrock_service_tier

return kwargs

def _should_omit_temperature(self) -> bool:
return supports_openai_compatible_reasoning(self.model)

# ── per-model output-tokens cap (shared with Router subclass) ────────────
# Hindsight's defaults (e.g. retain_max_completion_tokens=64000) target
# high-capacity models. When a configured deployment supports fewer
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from typing import Any

from hindsight_api.engine.providers.litellm_llm import LiteLLMLLM
from hindsight_api.engine.providers.model_capabilities import supports_openai_compatible_reasoning

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -94,6 +95,7 @@ def __init__(
# deployment Router picks. Uses LiteLLM's own per-model registry; unknown
# models contribute no cap. See LiteLLMLLM._cap_max_completion_tokens.
self._router_output_cap = self._compute_router_output_cap(config)
self._router_omits_temperature = self._config_has_temperature_rejecting_model(config)

logger.info("LiteLLM Router initialized; entrypoint model_name=%r", _ENTRYPOINT_MODEL_NAME)

Expand Down Expand Up @@ -130,6 +132,48 @@ def _resolve_completion_model(self, response: Any) -> str:
def _get_model_output_cap(self) -> int | None:
return self._router_output_cap

def _should_omit_temperature(self) -> bool:
return bool(getattr(self, "_router_omits_temperature", False))

def _config_has_temperature_rejecting_model(self, config: dict[str, Any]) -> bool:
reachable_model_names = self._reachable_model_names(config)
for deployment in (config.get("model_list") or []) if isinstance(config, dict) else []:
if not isinstance(deployment, dict):
continue
model_name = deployment.get("model_name")
if model_name not in reachable_model_names:
continue
params = deployment.get("litellm_params") or {}
model_str = params.get("model") if isinstance(params, dict) else None
if model_str and supports_openai_compatible_reasoning(model_str):
return True
return False

def _reachable_model_names(self, config: dict[str, Any]) -> set[str]:
reachable = {_ENTRYPOINT_MODEL_NAME}
pending = [_ENTRYPOINT_MODEL_NAME]
fallback_specs = []
for key in ("fallbacks", "context_window_fallbacks"):
value = config.get(key) if isinstance(config, dict) else None
if isinstance(value, list):
fallback_specs.extend(value)

while pending:
source = pending.pop()
for spec in fallback_specs:
if not isinstance(spec, dict):
continue
targets = spec.get(source)
if isinstance(targets, str):
targets = [targets]
if not isinstance(targets, list):
continue
for target in targets:
if isinstance(target, str) and target not in reachable:
reachable.add(target)
pending.append(target)
return reachable

def _build_common_kwargs(
self,
messages: list[dict[str, Any]],
Expand All @@ -144,7 +188,7 @@ def _build_common_kwargs(
}
if max_completion_tokens is not None:
kwargs["max_completion_tokens"] = self._cap_max_completion_tokens(max_completion_tokens)
if temperature is not None:
if temperature is not None and not self._should_omit_temperature():
kwargs["temperature"] = temperature

# Forward operator-configured default headers as ``extra_headers`` so they
Expand All @@ -154,8 +198,9 @@ def _build_common_kwargs(
# concern, so we inject them here too — mirroring the base provider.
# ``setdefault`` keeps any explicit per-call ``extra_headers`` authoritative;
# a per-call copy prevents LiteLLM/downstream from mutating the stored dict.
if self._default_headers:
kwargs.setdefault("extra_headers", dict(self._default_headers))
default_headers = getattr(self, "_default_headers", {})
if default_headers:
kwargs.setdefault("extra_headers", dict(default_headers))
return kwargs

async def verify_connection(self) -> None:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""Shared provider model capability helpers."""


def supports_openai_compatible_reasoning(model: str) -> bool:
"""Return True for OpenAI-compatible reasoning model names."""
model_lower = (model or "").lower()
if "deepseek" in model_lower:
# DeepSeek v4-flash is the non-thinking route. Treating every
# DeepSeek model as reasoning injects unsupported reasoning params.
return any(x in model_lower for x in ["v4-pro", "reasoner", "r1", "thinking"])
return any(x in model_lower for x in ["gpt-5", "o1", "o3"])
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from hindsight_api.engine.bank_attribution import apply_bank_attribution
from hindsight_api.engine.llm_interface import LLMInterface, OutputTooLongError, ProviderRateLimitResetError
from hindsight_api.engine.llm_trace import LLMResponseUsage, stash_response_usage
from hindsight_api.engine.providers.model_capabilities import supports_openai_compatible_reasoning
from hindsight_api.engine.response_models import LLMToolCall, LLMToolCallResult, TokenUsage
from hindsight_api.metrics import get_metrics_collector
from hindsight_api.worker.stage import set_stage
Expand Down Expand Up @@ -594,13 +595,7 @@ async def verify_connection(self) -> None:

def _supports_reasoning_model(self) -> bool:
"""Check if the current model is a reasoning model (o1, o3, GPT-5, DeepSeek)."""
model_lower = self.model.lower()
if "deepseek" in model_lower:
# DeepSeek v4-flash is the non-thinking route. Treating every
# DeepSeek model as a reasoning model injects reasoning_effort,
# which conflicts with thinking-disabled flash calls.
return any(x in model_lower for x in ["v4-pro", "reasoner", "r1", "thinking"])
return any(x in model_lower for x in ["gpt-5", "o1", "o3"])
return supports_openai_compatible_reasoning(self.model)

def _get_max_reasoning_tokens(self) -> int | None:
"""Get max reasoning tokens for reasoning models."""
Expand Down
43 changes: 41 additions & 2 deletions hindsight-api-slim/tests/test_llm_extra_body.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,15 +349,15 @@ class StructuredAnswer(BaseModel):
# ─── LiteLLM ──────────────────────────────────────────────────────────────────


def _make_litellm_provider(extra_body=None, default_headers=None):
def _make_litellm_provider(extra_body=None, default_headers=None, model="gpt-4o"):
pytest.importorskip("litellm")
from hindsight_api.engine.providers.litellm_llm import LiteLLMLLM

return LiteLLMLLM(
provider="litellm",
api_key="fake-key",
base_url="",
model="gpt-4o",
model=model,
extra_body=extra_body,
default_headers=default_headers,
)
Expand All @@ -375,6 +375,18 @@ def _fake_litellm_response():
return resp


async def _call_litellm_provider(provider, temperature=0.1):
provider._acompletion = AsyncMock(return_value=_fake_litellm_response())
with patch("hindsight_api.engine.providers.litellm_llm.get_metrics_collector"):
await provider.call(
messages=[{"role": "user", "content": "hi"}],
temperature=temperature,
scope="test",
max_retries=0,
)
return provider._acompletion.call_args.kwargs


def test_litellm_stores_extra_body():
provider = _make_litellm_provider(extra_body=EXTRA_BODY)
assert provider._extra_body == EXTRA_BODY
Expand Down Expand Up @@ -406,6 +418,33 @@ async def test_litellm_explicit_param_wins_over_extra_body():
assert provider._acompletion.call_args.kwargs.get("temperature") == 0.9


@pytest.mark.asyncio
@pytest.mark.parametrize(
("model", "expected_temperature"),
[
("azure/gpt-5.5", None),
("openai/o3-mini", None),
("deepseek/deepseek-reasoner", None),
("deepseek/deepseek-v4-flash", 0.1),
],
)
async def test_litellm_temperature_for_reasoning_models(model, expected_temperature):
kwargs = await _call_litellm_provider(_make_litellm_provider(model=model))

if expected_temperature is None:
assert "temperature" not in kwargs
else:
assert kwargs["temperature"] == expected_temperature


@pytest.mark.asyncio
async def test_litellm_gpt5_omits_extra_body_temperature():
provider = _make_litellm_provider(extra_body=EXTRA_BODY, model="azure/gpt-5.5")
kwargs = await _call_litellm_provider(provider, temperature=None)
assert "temperature" not in kwargs
assert kwargs.get("top_p") == 0.9


def test_litellm_router_forwards_extra_body():
"""The Router subclass forwards extra_body through to the shared LiteLLM base."""
pytest.importorskip("litellm")
Expand Down
59 changes: 59 additions & 0 deletions hindsight-api-slim/tests/test_llm_router_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,22 @@ def mock_router_response() -> MagicMock:
return response


def _router_config(
default_model: str,
*,
extra_model: str | None = None,
extra_name: str = "fallback",
fallbacks: list[dict[str, list[str]]] | None = None,
) -> dict[str, Any]:
model_list = [{"model_name": "default", "litellm_params": {"model": default_model, "api_key": "sk"}}]
if extra_model:
model_list.append({"model_name": extra_name, "litellm_params": {"model": extra_model, "api_key": "sk"}})
config: dict[str, Any] = {"model_list": model_list}
if fallbacks:
config["fallbacks"] = fallbacks
return config


# --- config parsing ----------------------------------------------------------


Expand Down Expand Up @@ -202,6 +218,20 @@ def _make_router_provider(config: dict[str, Any], mock_router: Any) -> LiteLLMRo


class TestRouterCall:
async def _call_with_temperature(self, config, mock_router_response):
mock_router = MagicMock()
mock_router.acompletion = AsyncMock(return_value=mock_router_response)
provider = _make_router_provider(config, mock_router)
provider._router_omits_temperature = provider._config_has_temperature_rejecting_model(config)

await provider.call(
messages=[{"role": "user", "content": "hi"}],
temperature=0.1,
max_retries=0,
)

return mock_router.acompletion.await_args.kwargs

@pytest.mark.asyncio
async def test_plain_text_call_targets_default_entrypoint(self, two_step_config, mock_router_response):
mock_router = MagicMock()
Expand Down Expand Up @@ -314,6 +344,35 @@ async def test_no_cap_when_litellm_registry_has_no_data(self, two_step_config, m
kwargs = mock_router.acompletion.await_args.kwargs
assert kwargs["max_completion_tokens"] == 64000

@pytest.mark.asyncio
@pytest.mark.parametrize(
("config", "expected_temperature"),
[
(_router_config("azure/gpt-5.5"), None),
(_router_config("openai/gpt-4o-mini", extra_model="azure/gpt-5.5", extra_name="unused"), 0.1),
(
_router_config(
"openai/gpt-4o-mini",
extra_model="azure/gpt-5.5",
fallbacks=[{"default": ["fallback"]}],
),
None,
),
],
)
async def test_router_temperature_for_reachable_reasoning_models(
self,
config,
expected_temperature,
mock_router_response,
):
kwargs = await self._call_with_temperature(config, mock_router_response)
assert kwargs["model"] == "default"
if expected_temperature is None:
assert "temperature" not in kwargs
else:
assert kwargs["temperature"] == expected_temperature

@pytest.mark.asyncio
async def test_call_with_tools(self, two_step_config):
response = MagicMock()
Expand Down