Skip to content

Commit 1202e33

Browse files
fix(litellm): omit temperature for reasoning models
Share the OpenAI-compatible reasoning model detector between the native OpenAI-compatible provider and LiteLLM-backed providers. Skip forwarding explicit temperature values through LiteLLM for those models, including Azure GPT-5 deployments, o-series models, and DeepSeek reasoning routes.
1 parent b0038e9 commit 1202e33

6 files changed

Lines changed: 142 additions & 11 deletions

File tree

hindsight-api-slim/hindsight_api/engine/providers/litellm_llm.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,19 @@
2424
from hindsight_api.config import DEFAULT_LLM_TIMEOUT, ENV_LLM_TIMEOUT
2525
from hindsight_api.engine.llm_interface import LLMInterface, OutputTooLongError
2626
from hindsight_api.engine.llm_trace import LLMResponseUsage, stash_response_usage
27+
from hindsight_api.engine.providers.model_capabilities import supports_openai_compatible_reasoning
2728
from hindsight_api.engine.response_models import LLMToolCall, LLMToolCallResult, TokenUsage
2829
from hindsight_api.metrics import get_metrics_collector
2930
from hindsight_api.worker.stage import set_stage
3031

3132
logger = logging.getLogger(__name__)
3233

3334

35+
def _model_rejects_temperature(model: str) -> bool:
    """Report whether ``model`` should be called without an explicit temperature.

    The decision is delegated to ``supports_openai_compatible_reasoning``:
    every model name that detector classifies as a reasoning model is treated
    here as one that rejects ``temperature``.
    """
    is_reasoning_model = supports_openai_compatible_reasoning(model)
    return is_reasoning_model
38+
39+
3440
def _usage_from_litellm_response(response: Any) -> LLMResponseUsage:
3541
"""Extract prompt/completion/cached token counts from a LiteLLM (OpenAI-shaped) usage block."""
3642
usage = getattr(response, "usage", None)
@@ -144,12 +150,18 @@ def _build_common_kwargs(
144150
for key, value in self._extra_body.items():
145151
kwargs.setdefault(key, value)
146152

153+
if self._should_omit_temperature():
154+
kwargs.pop("temperature", None)
155+
147156
# Bedrock service tier: flex (50% cheaper), priority, or reserved
148157
if self.model.startswith("bedrock/") and self.bedrock_service_tier is not None:
149158
kwargs["service_tier"] = self.bedrock_service_tier
150159

151160
return kwargs
152161

162+
def _should_omit_temperature(self) -> bool:
    """Tell ``_build_common_kwargs`` whether to drop ``temperature``.

    The answer depends only on ``self.model``: it is True when
    ``_model_rejects_temperature`` flags that model name.
    """
    configured_model = self.model
    return _model_rejects_temperature(configured_model)
164+
153165
# ── per-model output-tokens cap (shared with Router subclass) ────────────
154166
# Hindsight's defaults (e.g. retain_max_completion_tokens=64000) target
155167
# high-capacity models. When a configured deployment supports fewer

hindsight-api-slim/hindsight_api/engine/providers/litellm_router_llm.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
import logging
3737
from typing import Any
3838

39-
from hindsight_api.engine.providers.litellm_llm import LiteLLMLLM
39+
from hindsight_api.engine.providers.litellm_llm import LiteLLMLLM, _model_rejects_temperature
4040

4141
logger = logging.getLogger(__name__)
4242

@@ -94,6 +94,7 @@ def __init__(
9494
# deployment Router picks. Uses LiteLLM's own per-model registry; unknown
9595
# models contribute no cap. See LiteLLMLLM._cap_max_completion_tokens.
9696
self._router_output_cap = self._compute_router_output_cap(config)
97+
self._router_omits_temperature = self._config_has_temperature_rejecting_model(config)
9798

9899
logger.info("LiteLLM Router initialized; entrypoint model_name=%r", _ENTRYPOINT_MODEL_NAME)
99100

@@ -130,6 +131,19 @@ def _resolve_completion_model(self, response: Any) -> str:
130131
def _get_model_output_cap(self) -> int | None:
131132
return self._router_output_cap
132133

134+
def _should_omit_temperature(self) -> bool:
    """Return the router-wide "omit temperature" flag as a strict bool.

    Reads ``self._router_omits_temperature`` and falls back to False when the
    attribute is missing (presumably for instances created without running
    ``__init__`` -- confirm against the test helpers).
    """
    flag = getattr(self, "_router_omits_temperature", False)
    return True if flag else False
136+
137+
def _config_has_temperature_rejecting_model(self, config: dict[str, Any]) -> bool:
    """Check whether any configured deployment targets a temperature-rejecting model.

    Walks ``config["model_list"]`` and inspects each deployment's
    ``litellm_params["model"]``. Malformed entries (non-dict config,
    non-dict deployment, non-dict params, missing or empty model) are skipped
    rather than raising.

    Returns:
        True as soon as one deployment's model is flagged by
        ``_model_rejects_temperature``; False otherwise.
    """
    if not isinstance(config, dict):
        return False

    deployments = config.get("model_list") or []
    for entry in deployments:
        if not isinstance(entry, dict):
            continue
        litellm_params = entry.get("litellm_params") or {}
        if not isinstance(litellm_params, dict):
            continue
        candidate = litellm_params.get("model")
        # Only consult the detector for a non-empty model value.
        if candidate and _model_rejects_temperature(candidate):
            return True
    return False
146+
133147
def _build_common_kwargs(
134148
self,
135149
messages: list[dict[str, Any]],
@@ -144,7 +158,7 @@ def _build_common_kwargs(
144158
}
145159
if max_completion_tokens is not None:
146160
kwargs["max_completion_tokens"] = self._cap_max_completion_tokens(max_completion_tokens)
147-
if temperature is not None:
161+
if temperature is not None and not self._should_omit_temperature():
148162
kwargs["temperature"] = temperature
149163
return kwargs
150164

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
"""Shared provider model capability helpers."""
2+
3+
4+
def supports_openai_compatible_reasoning(model: str) -> bool:
5+
"""Return True for OpenAI-compatible reasoning model names."""
6+
model_lower = (model or "").lower()
7+
if "deepseek" in model_lower:
8+
# DeepSeek v4-flash is the non-thinking route. Treating every
9+
# DeepSeek model as reasoning injects unsupported reasoning params.
10+
return any(x in model_lower for x in ["v4-pro", "reasoner", "r1", "thinking"])
11+
return any(x in model_lower for x in ["gpt-5", "o1", "o3"])

hindsight-api-slim/hindsight_api/engine/providers/openai_compatible_llm.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from hindsight_api.engine.bank_attribution import apply_bank_attribution
3939
from hindsight_api.engine.llm_interface import LLMInterface, OutputTooLongError, ProviderRateLimitResetError
4040
from hindsight_api.engine.llm_trace import LLMResponseUsage, stash_response_usage
41+
from hindsight_api.engine.providers.model_capabilities import supports_openai_compatible_reasoning
4142
from hindsight_api.engine.response_models import LLMToolCall, LLMToolCallResult, TokenUsage
4243
from hindsight_api.metrics import get_metrics_collector
4344
from hindsight_api.worker.stage import set_stage
@@ -594,13 +595,7 @@ async def verify_connection(self) -> None:
594595

595596
def _supports_reasoning_model(self) -> bool:
596597
"""Check if the current model is a reasoning model (o1, o3, GPT-5, DeepSeek)."""
597-
model_lower = self.model.lower()
598-
if "deepseek" in model_lower:
599-
# DeepSeek v4-flash is the non-thinking route. Treating every
600-
# DeepSeek model as a reasoning model injects reasoning_effort,
601-
# which conflicts with thinking-disabled flash calls.
602-
return any(x in model_lower for x in ["v4-pro", "reasoner", "r1", "thinking"])
603-
return any(x in model_lower for x in ["gpt-5", "o1", "o3"])
598+
return supports_openai_compatible_reasoning(self.model)
604599

605600
def _get_max_reasoning_tokens(self) -> int | None:
606601
"""Get max reasoning tokens for reasoning models."""

hindsight-api-slim/tests/test_llm_extra_body.py

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -348,15 +348,15 @@ class StructuredAnswer(BaseModel):
348348
# ─── LiteLLM ──────────────────────────────────────────────────────────────────
349349

350350

351-
def _make_litellm_provider(extra_body=None):
351+
def _make_litellm_provider(extra_body=None, model="gpt-4o"):
352352
pytest.importorskip("litellm")
353353
from hindsight_api.engine.providers.litellm_llm import LiteLLMLLM
354354

355355
return LiteLLMLLM(
356356
provider="litellm",
357357
api_key="fake-key",
358358
base_url="",
359-
model="gpt-4o",
359+
model=model,
360360
extra_body=extra_body,
361361
)
362362

@@ -404,6 +404,80 @@ async def test_litellm_explicit_param_wins_over_extra_body():
404404
assert provider._acompletion.call_args.kwargs.get("temperature") == 0.9
405405

406406

407+
@pytest.mark.asyncio
async def test_litellm_gpt5_omits_explicit_temperature():
    """An explicit temperature is not forwarded for an Azure GPT-5 model."""
    provider = _make_litellm_provider(model="azure/gpt-5.5")
    provider._acompletion = AsyncMock(return_value=_fake_litellm_response())
    call_args = {
        "messages": [{"role": "user", "content": "hi"}],
        "temperature": 0.1,
        "scope": "test",
        "max_retries": 0,
    }

    with patch("hindsight_api.engine.providers.litellm_llm.get_metrics_collector"):
        await provider.call(**call_args)

    forwarded = provider._acompletion.call_args.kwargs
    assert "temperature" not in forwarded
421+
422+
423+
@pytest.mark.asyncio
async def test_litellm_o_series_omits_explicit_temperature():
    """An explicit temperature is not forwarded for an o-series model."""
    provider = _make_litellm_provider(model="openai/o3-mini")
    provider._acompletion = AsyncMock(return_value=_fake_litellm_response())
    call_args = {
        "messages": [{"role": "user", "content": "hi"}],
        "temperature": 0.1,
        "scope": "test",
        "max_retries": 0,
    }

    with patch("hindsight_api.engine.providers.litellm_llm.get_metrics_collector"):
        await provider.call(**call_args)

    forwarded = provider._acompletion.call_args.kwargs
    assert "temperature" not in forwarded
437+
438+
439+
@pytest.mark.asyncio
async def test_litellm_deepseek_reasoning_omits_temperature_but_flash_keeps_it():
    """DeepSeek reasoner drops temperature; the v4-flash route still receives it."""
    reasoner = _make_litellm_provider(model="deepseek/deepseek-reasoner")
    reasoner._acompletion = AsyncMock(return_value=_fake_litellm_response())
    flash = _make_litellm_provider(model="deepseek/deepseek-v4-flash")
    flash._acompletion = AsyncMock(return_value=_fake_litellm_response())

    with patch("hindsight_api.engine.providers.litellm_llm.get_metrics_collector"):
        # Same request against both providers, reasoner first.
        for provider in (reasoner, flash):
            await provider.call(
                messages=[{"role": "user", "content": "hi"}],
                temperature=0.1,
                scope="test",
                max_retries=0,
            )

    reasoner_kwargs = reasoner._acompletion.call_args.kwargs
    flash_kwargs = flash._acompletion.call_args.kwargs
    assert "temperature" not in reasoner_kwargs
    assert flash_kwargs.get("temperature") == 0.1
462+
463+
464+
@pytest.mark.asyncio
async def test_litellm_gpt5_omits_extra_body_temperature():
    """With ``EXTRA_BODY`` configured, a GPT-5 call has no temperature but keeps ``top_p``."""
    provider = _make_litellm_provider(extra_body=EXTRA_BODY, model="azure/gpt-5.5")
    provider._acompletion = AsyncMock(return_value=_fake_litellm_response())
    call_args = {
        "messages": [{"role": "user", "content": "hi"}],
        "scope": "test",
        "max_retries": 0,
    }

    with patch("hindsight_api.engine.providers.litellm_llm.get_metrics_collector"):
        await provider.call(**call_args)

    forwarded = provider._acompletion.call_args.kwargs
    assert "temperature" not in forwarded
    assert forwarded.get("top_p") == 0.9
479+
480+
407481
def test_litellm_router_forwards_extra_body():
408482
"""The Router subclass forwards extra_body through to the shared LiteLLM base."""
409483
pytest.importorskip("litellm")

hindsight-api-slim/tests/test_llm_router_provider.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,31 @@ async def test_no_cap_when_litellm_registry_has_no_data(self, two_step_config, m
314314
kwargs = mock_router.acompletion.await_args.kwargs
315315
assert kwargs["max_completion_tokens"] == 64000
316316

317+
@pytest.mark.asyncio
async def test_gpt5_deployment_omits_temperature(self, mock_router_response):
    """A Router config containing a GPT-5 deployment does not forward temperature."""
    mock_router = MagicMock()
    mock_router.acompletion = AsyncMock(return_value=mock_router_response)

    gpt5_deployment = {
        "model_name": "default",
        "litellm_params": {"model": "azure/gpt-5.5", "api_key": "sk-primary"},
    }
    config = {"model_list": [gpt5_deployment]}

    provider = _make_router_provider(config, mock_router)
    # Set the flag explicitly from the config before issuing the call.
    provider._router_omits_temperature = provider._config_has_temperature_rejecting_model(config)

    await provider.call(
        messages=[{"role": "user", "content": "hi"}],
        temperature=0.1,
        max_retries=0,
    )

    forwarded = mock_router.acompletion.await_args.kwargs
    assert forwarded["model"] == "default"
    assert "temperature" not in forwarded
341+
317342
@pytest.mark.asyncio
318343
async def test_call_with_tools(self, two_step_config):
319344
response = MagicMock()

0 commit comments

Comments
 (0)