Skip to content

Commit 40fe7aa

Browse files
authored
fix(llm): propagate per-scope LLM timeout + retry policy to the provider (#2452) (#2470)
The per-operation LLM request settings were resolved into HindsightConfig but never reached the provider that uses them, so configuring them was a silent no-op: - *_llm_timeout (retain/reflect/consolidation) and the global llm_timeout never reached the provider impl; it fell back to HINDSIGHT_API_LLM_TIMEOUT/120s, so HINDSIGHT_API_RETAIN_LLM_TIMEOUT=300 did nothing ("LiteLLM call exceeded timeout=120.0s"). - reflect_llm_max_retries/initial_backoff/max_backoff and consolidation_llm_initial_backoff/max_backoff were never consumed; reflect and consolidation used the hardcoded call()/call_with_tools() defaults (10/5), ignoring the documented "falls back to llm_max_retries" contract. Fix: resolve each operation's effective request defaults (per-op override else global) in MemoryEngine and carry them on the LLMProvider: - timeout is threaded config -> LLMProvider -> create_llm_provider -> provider impl for the providers that honour a configurable request timeout (LiteLLM, LiteLLM Router, OpenAI-compatible, Nous). None preserves each provider's own default, so Anthropic/Gemini keep their bespoke timeouts and the no-config path is byte-identical. - max_retries/initial_backoff/max_backoff become LLMProvider instance defaults that call()/call_with_tools() use when the per-call arg is omitted. Explicit per-call args (retain's resolved values, reflect's fast structured-extraction path) still win; providers built without config (from_env, tests) keep the 10/5 method fallback. The four operation scopes (default/retain/reflect/consolidation) and multi-LLM chain members all share their operation's resolved values via a small _LLMCallDefaults bundle. max_concurrent is intentionally left as-is (process-global semaphores read from env at startup, server-level only); the docs are clarified to call out that distinction. Also fixes a pre-existing breakage in test_llm_router_provider's __new__-based helper (missing _default_headers after #2466) so the suite is green. Tests: tests/test_llm_timeout_propagation.py covers provider-impl timeout threading, the call() retry-policy fallback/override, and per-op resolution/fallback in MemoryEngine.
1 parent 7393400 commit 40fe7aa

7 files changed

Lines changed: 358 additions & 27 deletions

File tree

hindsight-api-slim/hindsight_api/engine/llm_wrapper.py

Lines changed: 86 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@ def create_llm_provider(
253253
prompt_cache_enabled: bool = False,
254254
litellmrouter_config: dict[str, Any] | None = None,
255255
gemini_service_tier: str | None = None,
256+
timeout: float | None = None,
256257
) -> Any: # Returns LLMInterface
257258
"""
258259
Factory function to create the appropriate LLM provider implementation.
@@ -280,6 +281,12 @@ def create_llm_provider(
280281
vertexai_project_id: Vertex AI project ID (for VertexAI provider).
281282
vertexai_region: Vertex AI region (for VertexAI provider).
282283
vertexai_credentials: Vertex AI credentials object (for VertexAI provider).
284+
timeout: Per-request LLM timeout in seconds (resolved by the caller from the
285+
per-operation/global config). Threaded into the providers that honour a
286+
configurable request timeout (LiteLLM, LiteLLM Router, OpenAI-compatible,
287+
Nous). ``None`` lets each provider fall back to its own default
288+
(``HINDSIGHT_API_LLM_TIMEOUT`` / ``DEFAULT_LLM_TIMEOUT`` for those four;
289+
Anthropic and Gemini keep their provider-specific defaults).
283290
284291
Returns:
285292
LLMInterface implementation for the specified provider.
@@ -378,6 +385,7 @@ def create_llm_provider(
378385
reasoning_effort=reasoning_effort,
379386
extra_body=extra_body,
380387
default_headers=default_headers,
388+
timeout=timeout,
381389
)
382390

383391
elif provider_lower == "litellmrouter":
@@ -397,6 +405,7 @@ def create_llm_provider(
397405
reasoning_effort=reasoning_effort,
398406
extra_body=extra_body,
399407
default_headers=default_headers,
408+
timeout=timeout,
400409
)
401410

402411
elif provider_lower == "bedrock":
@@ -411,6 +420,7 @@ def create_llm_provider(
411420
extra_body=extra_body,
412421
default_headers=default_headers,
413422
bedrock_service_tier=bedrock_service_tier,
423+
timeout=timeout,
414424
)
415425

416426
elif provider_lower == "llamacpp":
@@ -457,6 +467,7 @@ def create_llm_provider(
457467
model=model,
458468
reasoning_effort=reasoning_effort,
459469
extra_body=extra_body,
470+
timeout=timeout,
460471
)
461472

462473
elif provider_lower in (
@@ -483,6 +494,7 @@ def create_llm_provider(
483494
groq_service_tier=groq_service_tier,
484495
openai_service_tier=openai_service_tier,
485496
extra_body=extra_body,
497+
timeout=timeout,
486498
)
487499

488500
else:
@@ -515,6 +527,10 @@ def __init__(
515527
vertexai_project_id: str | None = None,
516528
vertexai_region: str | None = None,
517529
vertexai_service_account_key: str | None = None,
530+
timeout: float | None = None,
531+
max_retries: int | None = None,
532+
initial_backoff: float | None = None,
533+
max_backoff: float | None = None,
518534
):
519535
"""
520536
Initialize LLM provider.
@@ -543,6 +559,17 @@ def __init__(
543559
``"us-central1"`` when ``None``).
544560
vertexai_service_account_key: Path to a Vertex AI service-account key file for
545561
``provider="vertexai"`` (uses ADC when ``None``).
562+
timeout: Per-request LLM timeout in seconds. Resolved by the caller from the
563+
per-operation/global config (``retain_llm_timeout`` falling back to
564+
``llm_timeout``, etc.). ``None`` lets each provider apply its own default.
565+
max_retries: Default retry-attempt budget for ``call`` / ``call_with_tools``
566+
when the per-call argument is omitted. Resolved by the caller from the
567+
per-operation/global config (``reflect_llm_max_retries`` falling back to
568+
``llm_max_retries``, etc.). ``None`` keeps each method's own fallback.
569+
initial_backoff: Default initial retry backoff (seconds), same resolution as
570+
``max_retries``. ``None`` keeps each method's own fallback.
571+
max_backoff: Default maximum retry backoff (seconds), same resolution as
572+
``max_retries``. ``None`` keeps each method's own fallback.
546573
547574
This constructor uses every argument as passed and does not read global
548575
``HindsightConfig``: resolving the server-level default for a ``None`` argument is the
@@ -556,6 +583,15 @@ def __init__(
556583
self.base_url = base_url
557584
self.model = model
558585
self.reasoning_effort = reasoning_effort
586+
# Per-request timeout (seconds). Used verbatim — the caller resolves the
587+
# per-operation/global fallback. ``None`` defers to the provider default.
588+
self.timeout = timeout
589+
# Default retry policy for call()/call_with_tools(). The caller resolves the
590+
# per-operation/global fallback; ``None`` keeps each method's own fallback so
591+
# providers built without a resolved config (from_env, tests) are unchanged.
592+
self.max_retries = max_retries
593+
self.initial_backoff = initial_backoff
594+
self.max_backoff = max_backoff
559595
self.litellmrouter_config = litellmrouter_config
560596
# Service tiers from hierarchical config (not env vars)
561597
self.groq_service_tier = groq_service_tier
@@ -706,6 +742,7 @@ def __init__(
706742
gemini_safety_settings=self.gemini_safety_settings,
707743
prompt_cache_enabled=self.prompt_cache_enabled,
708744
litellmrouter_config=router_config,
745+
timeout=self.timeout,
709746
)
710747

711748
# Backward compatibility: Keep mock provider properties
@@ -762,9 +799,9 @@ async def call(
762799
max_completion_tokens: int | None = None,
763800
temperature: float | None = None,
764801
scope: str = "memory",
765-
max_retries: int = 10,
766-
initial_backoff: float = 1.0,
767-
max_backoff: float = 60.0,
802+
max_retries: int | None = None,
803+
initial_backoff: float | None = None,
804+
max_backoff: float | None = None,
768805
skip_validation: bool = False,
769806
strict_schema: bool = False,
770807
return_usage: bool = False,
@@ -779,9 +816,12 @@ async def call(
779816
max_completion_tokens: Maximum tokens in response.
780817
temperature: Sampling temperature (0.0-2.0).
781818
scope: Scope identifier for tracking.
782-
max_retries: Maximum retry attempts.
783-
initial_backoff: Initial backoff time in seconds.
784-
max_backoff: Maximum backoff time in seconds.
819+
max_retries: Maximum retry attempts. ``None`` uses the provider's configured
820+
default (per-operation/global ``llm_max_retries``), else 10.
821+
initial_backoff: Initial backoff time in seconds. ``None`` uses the provider's
822+
configured default (``llm_initial_backoff``), else 1.0.
823+
max_backoff: Maximum backoff time in seconds. ``None`` uses the provider's
824+
configured default (``llm_max_backoff``), else 60.0.
785825
skip_validation: Return raw JSON without Pydantic validation.
786826
strict_schema: Per-call override requesting grammar-enforced (json_schema strict)
787827
structured output instead of the soft json_object path. The server-level
@@ -806,6 +846,20 @@ async def call(
806846
structured = "+structured" if response_format is not None else ""
807847
set_stage(f"llm.{self.provider}.{scope}{structured}")
808848

849+
# Resolve the retry policy: explicit per-call arg wins, else the provider's
850+
# configured per-operation/global default, else this method's own fallback.
851+
max_retries = (
852+
max_retries if max_retries is not None else (self.max_retries if self.max_retries is not None else 10)
853+
)
854+
initial_backoff = (
855+
initial_backoff
856+
if initial_backoff is not None
857+
else (self.initial_backoff if self.initial_backoff is not None else 1.0)
858+
)
859+
max_backoff = (
860+
max_backoff if max_backoff is not None else (self.max_backoff if self.max_backoff is not None else 60.0)
861+
)
862+
809863
# Resolve strict-schema once, here, rather than in each provider: the
810864
# per-call argument OR the server-level HINDSIGHT_API_LLM_STRICT_SCHEMA
811865
# flag. Providers with a json_schema response_format (OpenAI-compatible,
@@ -908,9 +962,9 @@ async def call_with_tools(
908962
max_completion_tokens: int | None = None,
909963
temperature: float | None = None,
910964
scope: str = "tools",
911-
max_retries: int = 5,
912-
initial_backoff: float = 1.0,
913-
max_backoff: float = 30.0,
965+
max_retries: int | None = None,
966+
initial_backoff: float | None = None,
967+
max_backoff: float | None = None,
914968
tool_choice: str | dict[str, Any] = "auto",
915969
cached_prefix: str | None = None,
916970
) -> "LLMToolCallResult":
@@ -923,9 +977,12 @@ async def call_with_tools(
923977
max_completion_tokens: Maximum tokens in response.
924978
temperature: Sampling temperature (0.0-2.0).
925979
scope: Scope identifier for tracking.
926-
max_retries: Maximum retry attempts.
927-
initial_backoff: Initial backoff time in seconds.
928-
max_backoff: Maximum backoff time in seconds.
980+
max_retries: Maximum retry attempts. ``None`` uses the provider's configured
981+
default (per-operation/global ``llm_max_retries``), else 5.
982+
initial_backoff: Initial backoff time in seconds. ``None`` uses the provider's
983+
configured default (``llm_initial_backoff``), else 1.0.
984+
max_backoff: Maximum backoff time in seconds. ``None`` uses the provider's
985+
configured default (``llm_max_backoff``), else 30.0.
929986
tool_choice: How to choose tools - "auto", "none", "required", or {"type": "function", "function": {"name": "..."}}
930987
931988
Returns:
@@ -935,6 +992,20 @@ async def call_with_tools(
935992

936993
set_stage(f"llm.{self.provider}.{scope}+tools")
937994

995+
# Resolve the retry policy: explicit per-call arg wins, else the provider's
996+
# configured per-operation/global default, else this method's own fallback.
997+
max_retries = (
998+
max_retries if max_retries is not None else (self.max_retries if self.max_retries is not None else 5)
999+
)
1000+
initial_backoff = (
1001+
initial_backoff
1002+
if initial_backoff is not None
1003+
else (self.initial_backoff if self.initial_backoff is not None else 1.0)
1004+
)
1005+
max_backoff = (
1006+
max_backoff if max_backoff is not None else (self.max_backoff if self.max_backoff is not None else 30.0)
1007+
)
1008+
9381009
# Failures forwarded to the GenAI recorder; successes recorded by providers.
9391010
from ..tracing import get_span_recorder
9401011
from .llm_trace import (
@@ -1178,6 +1249,7 @@ def from_env(cls) -> "LLMProvider":
11781249
DEFAULT_LLM_PROMPT_CACHE_ENABLED,
11791250
DEFAULT_LLM_PROVIDER,
11801251
DEFAULT_LLM_REASONING_EFFORT,
1252+
DEFAULT_LLM_TIMEOUT,
11811253
ENV_LLM_API_KEY,
11821254
ENV_LLM_BASE_URL,
11831255
ENV_LLM_BEDROCK_SERVICE_TIER,
@@ -1192,6 +1264,7 @@ def from_env(cls) -> "LLMProvider":
11921264
ENV_LLM_PROMPT_CACHE_ENABLED,
11931265
ENV_LLM_PROVIDER,
11941266
ENV_LLM_REASONING_EFFORT,
1267+
ENV_LLM_TIMEOUT,
11951268
ENV_LLM_VERTEXAI_PROJECT_ID,
11961269
ENV_LLM_VERTEXAI_REGION,
11971270
ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY,
@@ -1245,6 +1318,7 @@ def from_env(cls) -> "LLMProvider":
12451318
vertexai_project_id=os.getenv(ENV_LLM_VERTEXAI_PROJECT_ID) or None,
12461319
vertexai_region=os.getenv(ENV_LLM_VERTEXAI_REGION) or None,
12471320
vertexai_service_account_key=os.getenv(ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY) or None,
1321+
timeout=float(os.getenv(ENV_LLM_TIMEOUT, str(DEFAULT_LLM_TIMEOUT))),
12481322
)
12491323

12501324

hindsight-api-slim/hindsight_api/engine/memory_engine.py

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,33 @@ def validate_sql_schema(sql: str) -> None:
388388
RetainOutboxCallbackFactory = Callable[[list[RetainContentDict]], RetainOutboxCallback | None]
389389

390390

391-
def _member_to_llm(member: "LLMMemberConfig", config: HindsightConfig) -> LLMConfig:
391+
@dataclass(frozen=True)
392+
class _LLMCallDefaults:
393+
"""An operation's resolved per-request defaults, threaded into every provider
394+
of its multi-LLM chain.
395+
396+
Each field is the effective value after the per-op-override-else-global
397+
resolution (e.g. ``retain_llm_timeout`` falling back to ``llm_timeout``). They
398+
are carried on the ``LLMProvider`` and used by ``call``/``call_with_tools`` when
399+
the per-call argument is omitted — previously these per-op config fields were
400+
resolved but never reached the provider (issue #2452).
401+
"""
402+
403+
timeout: float | None
404+
max_retries: int | None
405+
initial_backoff: float | None
406+
max_backoff: float | None
407+
408+
def as_kwargs(self) -> dict[str, Any]:
409+
return {
410+
"timeout": self.timeout,
411+
"max_retries": self.max_retries,
412+
"initial_backoff": self.initial_backoff,
413+
"max_backoff": self.max_backoff,
414+
}
415+
416+
417+
def _member_to_llm(member: "LLMMemberConfig", config: HindsightConfig, defaults: _LLMCallDefaults) -> LLMConfig:
392418
"""Build an LLMProvider from one indexed multi-LLM member.
393419

394420
``LLMProvider`` uses its arguments verbatim (it no longer reads global config),
@@ -397,6 +423,10 @@ def _member_to_llm(member: "LLMMemberConfig", config: HindsightConfig) -> LLMCon
397423
(``gemini_safety_settings``, ``prompt_cache_enabled``) take the global default.
398424
``gemini_safety_settings`` is bank-configurable so it comes from the raw config
399425
(the proxy blocks it); the per-bank value is applied per-call downstream.
426+
427+
``defaults`` are the operation's already-resolved request defaults (timeout +
428+
retry policy). Members have no per-member knobs for these, so every member of a
429+
chain shares its operation's values.
400430
"""
401431
from ..config import _get_raw_config
402432

@@ -416,13 +446,15 @@ def _member_to_llm(member: "LLMMemberConfig", config: HindsightConfig) -> LLMCon
416446
vertexai_region=member.vertexai_region or config.llm_vertexai_region,
417447
vertexai_service_account_key=member.vertexai_service_account_key or config.llm_vertexai_service_account_key,
418448
litellmrouter_config=member.litellmrouter_config or config.llm_litellmrouter_config,
449+
**defaults.as_kwargs(),
419450
)
420451

421452

422453
def _build_llm(
423454
base: LLMConfig,
424455
config: HindsightConfig,
425456
prefix: str,
457+
defaults: _LLMCallDefaults,
426458
) -> "LLMConfig | MultiLLMProvider":
427459
"""Resolve an operation's multi-LLM chain and wrap ``base`` (member 0) in it.
428460

@@ -431,6 +463,9 @@ def _build_llm(
431463
inherits the global chain, mirroring how per-op base config falls back to the
432464
global LLM config. Returns ``base`` unchanged when no chain is configured
433465
(byte-identical hot path).
466+
467+
``defaults`` are the operation's resolved request defaults, applied to every
468+
fallback member so the whole chain shares the operation's effective settings.
434469
"""
435470
members: list[LLMMemberConfig] = getattr(config, f"{prefix}llm_members")
436471
strategy: LLMStrategyConfig | None = getattr(config, f"{prefix}llm_strategy")
@@ -442,7 +477,7 @@ def _build_llm(
442477

443478
if not strategy or not members:
444479
return base
445-
extra = [_member_to_llm(m, config) for m in members]
480+
extra = [_member_to_llm(m, config, defaults) for m in members]
446481
return MultiLLMProvider([base, *extra], strategy)
447482

448483

@@ -1023,6 +1058,29 @@ def __init__(
10231058

10241059
self.query_analyzer = DateparserQueryAnalyzer()
10251060

1061+
# Resolve each operation's effective per-request defaults: a per-op override
1062+
# (``HINDSIGHT_API_RETAIN_LLM_TIMEOUT``, ``..._MAX_RETRIES``, ``..._INITIAL_BACKOFF``,
1063+
# ``..._MAX_BACKOFF``) wins, otherwise the global ``llm_*``. Threaded all the way
1064+
# into the provider so the configured value actually governs the call (issue #2452);
1065+
# previously these per-op fields were resolved into config but never reached the
1066+
# provider, which silently used the global/method default.
1067+
def _op_defaults(prefix: str) -> _LLMCallDefaults:
1068+
def pick(field: str) -> Any:
1069+
per_op = getattr(config, f"{prefix}llm_{field}") if prefix else None
1070+
return per_op if per_op is not None else getattr(config, f"llm_{field}")
1071+
1072+
return _LLMCallDefaults(
1073+
timeout=pick("timeout"),
1074+
max_retries=pick("max_retries"),
1075+
initial_backoff=pick("initial_backoff"),
1076+
max_backoff=pick("max_backoff"),
1077+
)
1078+
1079+
default_call_defaults = _op_defaults("")
1080+
retain_call_defaults = _op_defaults("retain_")
1081+
reflect_call_defaults = _op_defaults("reflect_")
1082+
consolidation_call_defaults = _op_defaults("consolidation_")
1083+
10261084
# Initialize LLM configuration (default, used as fallback)
10271085
_default_base_llm = LLMConfig(
10281086
provider=memory_llm_provider,
@@ -1042,8 +1100,9 @@ def __init__(
10421100
vertexai_project_id=config.llm_vertexai_project_id,
10431101
vertexai_region=config.llm_vertexai_region,
10441102
vertexai_service_account_key=config.llm_vertexai_service_account_key,
1103+
**default_call_defaults.as_kwargs(),
10451104
)
1046-
self._llm_config = _build_llm(_default_base_llm, config, "")
1105+
self._llm_config = _build_llm(_default_base_llm, config, "", default_call_defaults)
10471106

10481107
# Store client and model for convenience (deprecated: use _llm_config.call() instead).
10491108
# Read from the primary member so a multi-LLM chain behaves like the base config here.
@@ -1085,8 +1144,9 @@ def __init__(
10851144
vertexai_project_id=config.llm_vertexai_project_id,
10861145
vertexai_region=config.llm_vertexai_region,
10871146
vertexai_service_account_key=config.llm_vertexai_service_account_key,
1147+
**retain_call_defaults.as_kwargs(),
10881148
)
1089-
self._retain_llm_config = _build_llm(_retain_base_llm, config, "retain_")
1149+
self._retain_llm_config = _build_llm(_retain_base_llm, config, "retain_", retain_call_defaults)
10901150

10911151
# Reflect LLM config - for think/observe operations (can use lighter models)
10921152
reflect_provider = reflect_llm_provider or config.reflect_llm_provider or memory_llm_provider
@@ -1122,8 +1182,9 @@ def __init__(
11221182
vertexai_project_id=config.llm_vertexai_project_id,
11231183
vertexai_region=config.llm_vertexai_region,
11241184
vertexai_service_account_key=config.llm_vertexai_service_account_key,
1185+
**reflect_call_defaults.as_kwargs(),
11251186
)
1126-
self._reflect_llm_config = _build_llm(_reflect_base_llm, config, "reflect_")
1187+
self._reflect_llm_config = _build_llm(_reflect_base_llm, config, "reflect_", reflect_call_defaults)
11271188

11281189
# Consolidation LLM config - for mental model consolidation (can use efficient models)
11291190
consolidation_provider = consolidation_llm_provider or config.consolidation_llm_provider or memory_llm_provider
@@ -1159,8 +1220,11 @@ def __init__(
11591220
vertexai_project_id=config.llm_vertexai_project_id,
11601221
vertexai_region=config.llm_vertexai_region,
11611222
vertexai_service_account_key=config.llm_vertexai_service_account_key,
1223+
**consolidation_call_defaults.as_kwargs(),
1224+
)
1225+
self._consolidation_llm_config = _build_llm(
1226+
_consolidation_base_llm, config, "consolidation_", consolidation_call_defaults
11621227
)
1163-
self._consolidation_llm_config = _build_llm(_consolidation_base_llm, config, "consolidation_")
11641228

11651229
# Initialize cross-encoder reranker (cached for performance)
11661230
self._cross_encoder_reranker = CrossEncoderReranker(cross_encoder=cross_encoder)

0 commit comments

Comments
 (0)