diff --git a/hindsight-api-slim/hindsight_api/engine/providers/litellm_llm.py b/hindsight-api-slim/hindsight_api/engine/providers/litellm_llm.py index 9bbf8e876..111aaafb2 100644 --- a/hindsight-api-slim/hindsight_api/engine/providers/litellm_llm.py +++ b/hindsight-api-slim/hindsight_api/engine/providers/litellm_llm.py @@ -24,6 +24,7 @@ from hindsight_api.config import DEFAULT_LLM_TIMEOUT, ENV_LLM_TIMEOUT from hindsight_api.engine.llm_interface import LLMInterface, OutputTooLongError from hindsight_api.engine.llm_trace import LLMResponseUsage, stash_response_usage +from hindsight_api.engine.providers.model_capabilities import supports_openai_compatible_reasoning from hindsight_api.engine.response_models import LLMToolCall, LLMToolCallResult, TokenUsage from hindsight_api.metrics import get_metrics_collector from hindsight_api.worker.stage import set_stage @@ -159,12 +160,18 @@ def _build_common_kwargs( if self._default_headers: kwargs.setdefault("extra_headers", dict(self._default_headers)) + if self._should_omit_temperature(): + kwargs.pop("temperature", None) + # Bedrock service tier: flex (50% cheaper), priority, or reserved if self.model.startswith("bedrock/") and self.bedrock_service_tier is not None: kwargs["service_tier"] = self.bedrock_service_tier return kwargs + def _should_omit_temperature(self) -> bool: + return supports_openai_compatible_reasoning(self.model) + # ── per-model output-tokens cap (shared with Router subclass) ──────────── # Hindsight's defaults (e.g. retain_max_completion_tokens=64000) target # high-capacity models. When a configured deployment supports fewer diff --git a/hindsight-api-slim/hindsight_api/engine/providers/litellm_router_llm.py b/hindsight-api-slim/hindsight_api/engine/providers/litellm_router_llm.py index e3a0528b3..5dc91cb6a 100644 --- a/hindsight-api-slim/hindsight_api/engine/providers/litellm_router_llm.py +++ b/hindsight-api-slim/hindsight_api/engine/providers/litellm_router_llm.py @@ -37,6 +37,7 @@ from typing import Any from hindsight_api.engine.providers.litellm_llm import LiteLLMLLM +from hindsight_api.engine.providers.model_capabilities import supports_openai_compatible_reasoning logger = logging.getLogger(__name__) @@ -94,6 +95,7 @@ def __init__( # deployment Router picks. Uses LiteLLM's own per-model registry; unknown # models contribute no cap. See LiteLLMLLM._cap_max_completion_tokens. self._router_output_cap = self._compute_router_output_cap(config) + self._router_omits_temperature = self._config_has_temperature_rejecting_model(config) logger.info("LiteLLM Router initialized; entrypoint model_name=%r", _ENTRYPOINT_MODEL_NAME) @@ -130,6 +132,48 @@ def _resolve_completion_model(self, response: Any) -> str: def _get_model_output_cap(self) -> int | None: return self._router_output_cap + def _should_omit_temperature(self) -> bool: + return bool(getattr(self, "_router_omits_temperature", False)) + + def _config_has_temperature_rejecting_model(self, config: dict[str, Any]) -> bool: + reachable_model_names = self._reachable_model_names(config) + for deployment in (config.get("model_list") or []) if isinstance(config, dict) else []: + if not isinstance(deployment, dict): + continue + model_name = deployment.get("model_name") + if model_name not in reachable_model_names: + continue + params = deployment.get("litellm_params") or {} + model_str = params.get("model") if isinstance(params, dict) else None + if model_str and supports_openai_compatible_reasoning(model_str): + return True + return False + + def _reachable_model_names(self, config: dict[str, Any]) -> set[str]: + reachable = {_ENTRYPOINT_MODEL_NAME} + pending = [_ENTRYPOINT_MODEL_NAME] + fallback_specs = [] + for key in ("fallbacks", "context_window_fallbacks"): + value = config.get(key) if isinstance(config, dict) else None + if isinstance(value, list): + fallback_specs.extend(value) + + while pending: + source = pending.pop() + for spec in fallback_specs: + if not isinstance(spec, dict): + continue + targets = spec.get(source) + if isinstance(targets, str): + targets = [targets] + if not isinstance(targets, list): + continue + for target in targets: + if isinstance(target, str) and target not in reachable: + reachable.add(target) + pending.append(target) + return reachable + def _build_common_kwargs( self, messages: list[dict[str, Any]], @@ -144,7 +188,7 @@ def _build_common_kwargs( } if max_completion_tokens is not None: kwargs["max_completion_tokens"] = self._cap_max_completion_tokens(max_completion_tokens) - if temperature is not None: + if temperature is not None and not self._should_omit_temperature(): kwargs["temperature"] = temperature # Forward operator-configured default headers as ``extra_headers`` so they @@ -154,8 +198,9 @@ def _build_common_kwargs( # concern, so we inject them here too — mirroring the base provider. # ``setdefault`` keeps any explicit per-call ``extra_headers`` authoritative; # a per-call copy prevents LiteLLM/downstream from mutating the stored dict. - if self._default_headers: - kwargs.setdefault("extra_headers", dict(self._default_headers)) + default_headers = getattr(self, "_default_headers", {}) + if default_headers: + kwargs.setdefault("extra_headers", dict(default_headers)) return kwargs async def verify_connection(self) -> None: diff --git a/hindsight-api-slim/hindsight_api/engine/providers/model_capabilities.py b/hindsight-api-slim/hindsight_api/engine/providers/model_capabilities.py new file mode 100644 index 000000000..8ad46f1f8 --- /dev/null +++ b/hindsight-api-slim/hindsight_api/engine/providers/model_capabilities.py @@ -0,0 +1,11 @@ +"""Shared provider model capability helpers.""" + + +def supports_openai_compatible_reasoning(model: str) -> bool: + """Return True for OpenAI-compatible reasoning model names.""" + model_lower = (model or "").lower() + if "deepseek" in model_lower: + # DeepSeek v4-flash is the non-thinking route. Treating every + # DeepSeek model as reasoning injects unsupported reasoning params. + return any(x in model_lower for x in ["v4-pro", "reasoner", "r1", "thinking"]) + return any(x in model_lower for x in ["gpt-5", "o1", "o3"]) diff --git a/hindsight-api-slim/hindsight_api/engine/providers/openai_compatible_llm.py b/hindsight-api-slim/hindsight_api/engine/providers/openai_compatible_llm.py index aade75602..6fafb0844 100644 --- a/hindsight-api-slim/hindsight_api/engine/providers/openai_compatible_llm.py +++ b/hindsight-api-slim/hindsight_api/engine/providers/openai_compatible_llm.py @@ -38,6 +38,7 @@ from hindsight_api.engine.bank_attribution import apply_bank_attribution from hindsight_api.engine.llm_interface import LLMInterface, OutputTooLongError, ProviderRateLimitResetError from hindsight_api.engine.llm_trace import LLMResponseUsage, stash_response_usage +from hindsight_api.engine.providers.model_capabilities import supports_openai_compatible_reasoning from hindsight_api.engine.response_models import LLMToolCall, LLMToolCallResult, TokenUsage from hindsight_api.metrics import get_metrics_collector from hindsight_api.worker.stage import set_stage @@ -594,13 +595,7 @@ async def verify_connection(self) -> None: def _supports_reasoning_model(self) -> bool: """Check if the current model is a reasoning model (o1, o3, GPT-5, DeepSeek).""" - model_lower = self.model.lower() - if "deepseek" in model_lower: - # DeepSeek v4-flash is the non-thinking route. Treating every - # DeepSeek model as a reasoning model injects reasoning_effort, - # which conflicts with thinking-disabled flash calls. - return any(x in model_lower for x in ["v4-pro", "reasoner", "r1", "thinking"]) - return any(x in model_lower for x in ["gpt-5", "o1", "o3"]) + return supports_openai_compatible_reasoning(self.model) def _get_max_reasoning_tokens(self) -> int | None: """Get max reasoning tokens for reasoning models.""" diff --git a/hindsight-api-slim/tests/test_llm_extra_body.py b/hindsight-api-slim/tests/test_llm_extra_body.py index 6d500bdf1..310dbc434 100644 --- a/hindsight-api-slim/tests/test_llm_extra_body.py +++ b/hindsight-api-slim/tests/test_llm_extra_body.py @@ -349,7 +349,7 @@ class StructuredAnswer(BaseModel): # ─── LiteLLM ────────────────────────────────────────────────────────────────── -def _make_litellm_provider(extra_body=None, default_headers=None): +def _make_litellm_provider(extra_body=None, default_headers=None, model="gpt-4o"): pytest.importorskip("litellm") from hindsight_api.engine.providers.litellm_llm import LiteLLMLLM @@ -357,7 +357,7 @@ def _make_litellm_provider(extra_body=None, default_headers=None): provider="litellm", api_key="fake-key", base_url="", - model="gpt-4o", + model=model, extra_body=extra_body, default_headers=default_headers, ) @@ -375,6 +375,18 @@ def _fake_litellm_response(): return resp +async def _call_litellm_provider(provider, temperature=0.1): + provider._acompletion = AsyncMock(return_value=_fake_litellm_response()) + with patch("hindsight_api.engine.providers.litellm_llm.get_metrics_collector"): + await provider.call( + messages=[{"role": "user", "content": "hi"}], + temperature=temperature, + scope="test", + max_retries=0, + ) + return provider._acompletion.call_args.kwargs + + def test_litellm_stores_extra_body(): provider = _make_litellm_provider(extra_body=EXTRA_BODY) assert provider._extra_body == EXTRA_BODY @@ -406,6 +418,33 @@ async def test_litellm_explicit_param_wins_over_extra_body(): assert provider._acompletion.call_args.kwargs.get("temperature") == 0.9 +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("model", "expected_temperature"), + [ + ("azure/gpt-5.5", None), + ("openai/o3-mini", None), + ("deepseek/deepseek-reasoner", None), + ("deepseek/deepseek-v4-flash", 0.1), + ], +) +async def test_litellm_temperature_for_reasoning_models(model, expected_temperature): + kwargs = await _call_litellm_provider(_make_litellm_provider(model=model)) + + if expected_temperature is None: + assert "temperature" not in kwargs + else: + assert kwargs["temperature"] == expected_temperature + + +@pytest.mark.asyncio +async def test_litellm_gpt5_omits_extra_body_temperature(): + provider = _make_litellm_provider(extra_body=EXTRA_BODY, model="azure/gpt-5.5") + kwargs = await _call_litellm_provider(provider, temperature=None) + assert "temperature" not in kwargs + assert kwargs.get("top_p") == 0.9 + + def test_litellm_router_forwards_extra_body(): """The Router subclass forwards extra_body through to the shared LiteLLM base.""" pytest.importorskip("litellm") diff --git a/hindsight-api-slim/tests/test_llm_router_provider.py b/hindsight-api-slim/tests/test_llm_router_provider.py index 3450b04bb..20716f0da 100644 --- a/hindsight-api-slim/tests/test_llm_router_provider.py +++ b/hindsight-api-slim/tests/test_llm_router_provider.py @@ -70,6 +70,22 @@ def mock_router_response() -> MagicMock: return response +def _router_config( + default_model: str, + *, + extra_model: str | None = None, + extra_name: str = "fallback", + fallbacks: list[dict[str, list[str]]] | None = None, +) -> dict[str, Any]: + model_list = [{"model_name": "default", "litellm_params": {"model": default_model, "api_key": "sk"}}] + if extra_model: + model_list.append({"model_name": extra_name, "litellm_params": {"model": extra_model, "api_key": "sk"}}) + config: dict[str, Any] = {"model_list": model_list} + if fallbacks: + config["fallbacks"] = fallbacks + return config + + # --- config parsing ---------------------------------------------------------- @@ -202,6 +218,20 @@ def _make_router_provider(config: dict[str, Any], mock_router: Any) -> LiteLLMRo class TestRouterCall: + async def _call_with_temperature(self, config, mock_router_response): + mock_router = MagicMock() + mock_router.acompletion = AsyncMock(return_value=mock_router_response) + provider = _make_router_provider(config, mock_router) + provider._router_omits_temperature = provider._config_has_temperature_rejecting_model(config) + + await provider.call( + messages=[{"role": "user", "content": "hi"}], + temperature=0.1, + max_retries=0, + ) + + return mock_router.acompletion.await_args.kwargs + @pytest.mark.asyncio async def test_plain_text_call_targets_default_entrypoint(self, two_step_config, mock_router_response): mock_router = MagicMock() @@ -314,6 +344,35 @@ async def test_no_cap_when_litellm_registry_has_no_data(self, two_step_config, m kwargs = mock_router.acompletion.await_args.kwargs assert kwargs["max_completion_tokens"] == 64000 + @pytest.mark.asyncio + @pytest.mark.parametrize( + ("config", "expected_temperature"), + [ + (_router_config("azure/gpt-5.5"), None), + (_router_config("openai/gpt-4o-mini", extra_model="azure/gpt-5.5", extra_name="unused"), 0.1), + ( + _router_config( + "openai/gpt-4o-mini", + extra_model="azure/gpt-5.5", + fallbacks=[{"default": ["fallback"]}], + ), + None, + ), + ], + ) + async def test_router_temperature_for_reachable_reasoning_models( + self, + config, + expected_temperature, + mock_router_response, + ): + kwargs = await self._call_with_temperature(config, mock_router_response) + assert kwargs["model"] == "default" + if expected_temperature is None: + assert "temperature" not in kwargs + else: + assert kwargs["temperature"] == expected_temperature + @pytest.mark.asyncio async def test_call_with_tools(self, two_step_config): response = MagicMock()