vectorize-io
diff --git a/hindsight-api-slim/hindsight_api/engine/llm_wrapper.py b/hindsight-api-slim/hindsight_api/engine/llm_wrapper.py
Lines changed: 86 additions & 12 deletions
diff --git a/hindsight-api-slim/hindsight_api/engine/memory_engine.py b/hindsight-api-slim/hindsight_api/engine/memory_engine.py
Lines changed: 70 additions & 6 deletions
@@ -253,6 +253,7 @@ def create_llm_provider(
     prompt_cache_enabled: bool = False,
     litellmrouter_config: dict[str, Any] | None = None,
     gemini_service_tier: str | None = None,
+    timeout: float | None = None,
 ) -> Any:  # Returns LLMInterface
     """
     Factory function to create the appropriate LLM provider implementation.
@@ -280,6 +281,12 @@ def create_llm_provider(
         vertexai_project_id: Vertex AI project ID (for VertexAI provider).
         vertexai_region: Vertex AI region (for VertexAI provider).
         vertexai_credentials: Vertex AI credentials object (for VertexAI provider).
+        timeout: Per-request LLM timeout in seconds (resolved by the caller from the
+            per-operation/global config). Threaded into the providers that honour a
+            configurable request timeout (LiteLLM, LiteLLM Router, Bedrock, Nous,
+            OpenAI-compatible). ``None`` lets each provider fall back to its own default
+            (``HINDSIGHT_API_LLM_TIMEOUT`` / ``DEFAULT_LLM_TIMEOUT`` where the provider
+            reads it; Anthropic and Gemini keep their provider-specific defaults).
 
     Returns:
         LLMInterface implementation for the specified provider.
@@ -378,6 +385,7 @@ def create_llm_provider(
             reasoning_effort=reasoning_effort,
             extra_body=extra_body,
             default_headers=default_headers,
+            timeout=timeout,
         )
 
     elif provider_lower == "litellmrouter":
@@ -397,6 +405,7 @@ def create_llm_provider(
             reasoning_effort=reasoning_effort,
             extra_body=extra_body,
             default_headers=default_headers,
+            timeout=timeout,
         )
 
     elif provider_lower == "bedrock":
@@ -411,6 +420,7 @@ def create_llm_provider(
             extra_body=extra_body,
             default_headers=default_headers,
             bedrock_service_tier=bedrock_service_tier,
+            timeout=timeout,
         )
 
     elif provider_lower == "llamacpp":
@@ -457,6 +467,7 @@ def create_llm_provider(
             model=model,
             reasoning_effort=reasoning_effort,
             extra_body=extra_body,
+            timeout=timeout,
         )
 
     elif provider_lower in (
@@ -483,6 +494,7 @@ def create_llm_provider(
             groq_service_tier=groq_service_tier,
             openai_service_tier=openai_service_tier,
             extra_body=extra_body,
+            timeout=timeout,
         )
 
     else:
@@ -515,6 +527,10 @@ def __init__(
         vertexai_project_id: str | None = None,
         vertexai_region: str | None = None,
         vertexai_service_account_key: str | None = None,
+        timeout: float | None = None,
+        max_retries: int | None = None,
+        initial_backoff: float | None = None,
+        max_backoff: float | None = None,
     ):
         """
         Initialize LLM provider.
@@ -543,6 +559,17 @@ def __init__(
                 ``"us-central1"`` when ``None``).
             vertexai_service_account_key: Path to a Vertex AI service-account key file for
                 ``provider="vertexai"`` (uses ADC when ``None``).
+            timeout: Per-request LLM timeout in seconds. Resolved by the caller from the
+                per-operation/global config (``retain_llm_timeout`` falling back to
+                ``llm_timeout``, etc.). ``None`` lets each provider apply its own default.
+            max_retries: Default retry-attempt budget for ``call`` / ``call_with_tools``
+                when the per-call argument is omitted. Resolved by the caller from the
+                per-operation/global config (``reflect_llm_max_retries`` falling back to
+                ``llm_max_retries``, etc.). ``None`` keeps each method's own fallback.
+            initial_backoff: Default initial retry backoff (seconds), same resolution as
+                ``max_retries``. ``None`` keeps each method's own fallback.
+            max_backoff: Default maximum retry backoff (seconds), same resolution as
+                ``max_retries``. ``None`` keeps each method's own fallback.
 
         This constructor uses every argument as passed and does not read global
         ``HindsightConfig``: resolving the server-level default for a ``None`` argument is the
@@ -556,6 +583,15 @@ def __init__(
         self.base_url = base_url
         self.model = model
         self.reasoning_effort = reasoning_effort
+        # Per-request timeout (seconds). Used verbatim — the caller resolves the
+        # per-operation/global fallback. ``None`` defers to the provider default.
+        self.timeout = timeout
+        # Default retry policy for call()/call_with_tools(). The caller resolves the
+        # per-operation/global fallback; ``None`` keeps each method's own fallback so
+        # providers built without a resolved config (from_env, tests) are unchanged.
+        self.max_retries = max_retries
+        self.initial_backoff = initial_backoff
+        self.max_backoff = max_backoff
         self.litellmrouter_config = litellmrouter_config
         # Service tiers from hierarchical config (not env vars)
         self.groq_service_tier = groq_service_tier
@@ -706,6 +742,7 @@ def __init__(
             gemini_safety_settings=self.gemini_safety_settings,
             prompt_cache_enabled=self.prompt_cache_enabled,
             litellmrouter_config=router_config,
+            timeout=self.timeout,
         )
 
         # Backward compatibility: Keep mock provider properties
@@ -762,9 +799,9 @@ async def call(
         max_completion_tokens: int | None = None,
         temperature: float | None = None,
         scope: str = "memory",
-        max_retries: int = 10,
-        initial_backoff: float = 1.0,
-        max_backoff: float = 60.0,
+        max_retries: int | None = None,
+        initial_backoff: float | None = None,
+        max_backoff: float | None = None,
         skip_validation: bool = False,
         strict_schema: bool = False,
         return_usage: bool = False,
@@ -779,9 +816,12 @@ async def call(
             max_completion_tokens: Maximum tokens in response.
             temperature: Sampling temperature (0.0-2.0).
             scope: Scope identifier for tracking.
-            max_retries: Maximum retry attempts.
-            initial_backoff: Initial backoff time in seconds.
-            max_backoff: Maximum backoff time in seconds.
+            max_retries: Maximum retry attempts. ``None`` uses the provider's configured
+                default (per-operation/global ``llm_max_retries``), else 10.
+            initial_backoff: Initial backoff time in seconds. ``None`` uses the provider's
+                configured default (``llm_initial_backoff``), else 1.0.
+            max_backoff: Maximum backoff time in seconds. ``None`` uses the provider's
+                configured default (``llm_max_backoff``), else 60.0.
             skip_validation: Return raw JSON without Pydantic validation.
             strict_schema: Per-call override requesting grammar-enforced (json_schema strict)
                 structured output instead of the soft json_object path. The server-level
@@ -806,6 +846,20 @@ async def call(
         structured = "+structured" if response_format is not None else ""
         set_stage(f"llm.{self.provider}.{scope}{structured}")
 
+        # Resolve the retry policy: explicit per-call arg wins, else the provider's
+        # configured per-operation/global default, else this method's own fallback.
+        max_retries = (
+            max_retries if max_retries is not None else (self.max_retries if self.max_retries is not None else 10)
+        )
+        initial_backoff = (
+            initial_backoff
+            if initial_backoff is not None
+            else (self.initial_backoff if self.initial_backoff is not None else 1.0)
+        )
+        max_backoff = (
+            max_backoff if max_backoff is not None else (self.max_backoff if self.max_backoff is not None else 60.0)
+        )
+
         # Resolve strict-schema once, here, rather than in each provider: the
         # per-call argument OR the server-level HINDSIGHT_API_LLM_STRICT_SCHEMA
         # flag. Providers with a json_schema response_format (OpenAI-compatible,
@@ -908,9 +962,9 @@ async def call_with_tools(
         max_completion_tokens: int | None = None,
         temperature: float | None = None,
         scope: str = "tools",
-        max_retries: int = 5,
-        initial_backoff: float = 1.0,
-        max_backoff: float = 30.0,
+        max_retries: int | None = None,
+        initial_backoff: float | None = None,
+        max_backoff: float | None = None,
         tool_choice: str | dict[str, Any] = "auto",
         cached_prefix: str | None = None,
     ) -> "LLMToolCallResult":
@@ -923,9 +977,12 @@ async def call_with_tools(
             max_completion_tokens: Maximum tokens in response.
             temperature: Sampling temperature (0.0-2.0).
             scope: Scope identifier for tracking.
-            max_retries: Maximum retry attempts.
-            initial_backoff: Initial backoff time in seconds.
-            max_backoff: Maximum backoff time in seconds.
+            max_retries: Maximum retry attempts. ``None`` uses the provider's configured
+                default (per-operation/global ``llm_max_retries``), else 5.
+            initial_backoff: Initial backoff time in seconds. ``None`` uses the provider's
+                configured default (``llm_initial_backoff``), else 1.0.
+            max_backoff: Maximum backoff time in seconds. ``None`` uses the provider's
+                configured default (``llm_max_backoff``), else 30.0.
             tool_choice: How to choose tools - "auto", "none", "required", or {"type": "function", "function": {"name": "..."}}
 
         Returns:
@@ -935,6 +992,20 @@ async def call_with_tools(
 
         set_stage(f"llm.{self.provider}.{scope}+tools")
 
+        # Resolve the retry policy: explicit per-call arg wins, else the provider's
+        # configured per-operation/global default, else this method's own fallback.
+        max_retries = (
+            max_retries if max_retries is not None else (self.max_retries if self.max_retries is not None else 5)
+        )
+        initial_backoff = (
+            initial_backoff
+            if initial_backoff is not None
+            else (self.initial_backoff if self.initial_backoff is not None else 1.0)
+        )
+        max_backoff = (
+            max_backoff if max_backoff is not None else (self.max_backoff if self.max_backoff is not None else 30.0)
+        )
+
         # Failures forwarded to the GenAI recorder; successes recorded by providers.
         from ..tracing import get_span_recorder
         from .llm_trace import (
@@ -1178,6 +1249,7 @@ def from_env(cls) -> "LLMProvider":
             DEFAULT_LLM_PROMPT_CACHE_ENABLED,
             DEFAULT_LLM_PROVIDER,
             DEFAULT_LLM_REASONING_EFFORT,
+            DEFAULT_LLM_TIMEOUT,
             ENV_LLM_API_KEY,
             ENV_LLM_BASE_URL,
             ENV_LLM_BEDROCK_SERVICE_TIER,
@@ -1192,6 +1264,7 @@ def from_env(cls) -> "LLMProvider":
             ENV_LLM_PROMPT_CACHE_ENABLED,
             ENV_LLM_PROVIDER,
             ENV_LLM_REASONING_EFFORT,
+            ENV_LLM_TIMEOUT,
             ENV_LLM_VERTEXAI_PROJECT_ID,
             ENV_LLM_VERTEXAI_REGION,
             ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY,
@@ -1245,6 +1318,7 @@ def from_env(cls) -> "LLMProvider":
             vertexai_project_id=os.getenv(ENV_LLM_VERTEXAI_PROJECT_ID) or None,
             vertexai_region=os.getenv(ENV_LLM_VERTEXAI_REGION) or None,
             vertexai_service_account_key=os.getenv(ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY) or None,
+            timeout=float(os.getenv(ENV_LLM_TIMEOUT, str(DEFAULT_LLM_TIMEOUT))),
         )
 
 
 
@@ -388,7 +388,33 @@ def validate_sql_schema(sql: str) -> None:
 RetainOutboxCallbackFactory = Callable[[list[RetainContentDict]], RetainOutboxCallback | None]
 
 
-def _member_to_llm(member: "LLMMemberConfig", config: HindsightConfig) -> LLMConfig:
+@dataclass(frozen=True)
+class _LLMCallDefaults:
+    """An operation's resolved per-request defaults, threaded into every provider
+    of its multi-LLM chain.
+
+    Each field is the effective value after the per-op-override-else-global
+    resolution (e.g. ``retain_llm_timeout`` falling back to ``llm_timeout``). They
+    are carried on the ``LLMProvider`` and used by ``call``/``call_with_tools`` when
+    the per-call argument is omitted — previously these per-op config fields were
+    resolved but never reached the provider (issue #2452).
+    """
+
+    timeout: float | None  # per-request LLM timeout in seconds
+    max_retries: int | None  # retry-attempt budget for call()/call_with_tools()
+    initial_backoff: float | None  # initial retry backoff in seconds
+    max_backoff: float | None  # maximum retry backoff in seconds
+
+    def as_kwargs(self) -> dict[str, Any]:  # splatted as ``**kwargs`` into the LLMConfig constructor
+        return {
+            "timeout": self.timeout,
+            "max_retries": self.max_retries,
+            "initial_backoff": self.initial_backoff,
+            "max_backoff": self.max_backoff,
+        }
+
+
+def _member_to_llm(member: "LLMMemberConfig", config: HindsightConfig, defaults: _LLMCallDefaults) -> LLMConfig:
     """Build an LLMProvider from one indexed multi-LLM member.
 
     ``LLMProvider`` uses its arguments verbatim (it no longer reads global config),
@@ -397,6 +423,10 @@ def _member_to_llm(member: "LLMMemberConfig", config: HindsightConfig) -> LLMCon
     (``gemini_safety_settings``, ``prompt_cache_enabled``) take the global default.
     ``gemini_safety_settings`` is bank-configurable so it comes from the raw config
     (the proxy blocks it); the per-bank value is applied per-call downstream.
+
+    ``defaults`` are the operation's already-resolved request defaults (timeout +
+    retry policy). Members have no per-member knobs for these, so every member of a
+    chain shares its operation's values.
     """
     from ..config import _get_raw_config
 
@@ -416,13 +446,15 @@ def _member_to_llm(member: "LLMMemberConfig", config: HindsightConfig) -> LLMCon
         vertexai_region=member.vertexai_region or config.llm_vertexai_region,
         vertexai_service_account_key=member.vertexai_service_account_key or config.llm_vertexai_service_account_key,
         litellmrouter_config=member.litellmrouter_config or config.llm_litellmrouter_config,
+        **defaults.as_kwargs(),
     )
 
 
 def _build_llm(
     base: LLMConfig,
     config: HindsightConfig,
     prefix: str,
+    defaults: _LLMCallDefaults,
 ) -> "LLMConfig | MultiLLMProvider":
     """Resolve an operation's multi-LLM chain and wrap ``base`` (member 0) in it.
 
@@ -431,6 +463,9 @@ def _build_llm(
     inherits the global chain, mirroring how per-op base config falls back to the
     global LLM config. Returns ``base`` unchanged when no chain is configured
     (byte-identical hot path).
+
+    ``defaults`` are the operation's resolved request defaults, applied to every
+    fallback member so the whole chain shares the operation's effective settings.
     """
     members: list[LLMMemberConfig] = getattr(config, f"{prefix}llm_members")
     strategy: LLMStrategyConfig | None = getattr(config, f"{prefix}llm_strategy")
@@ -442,7 +477,7 @@ def _build_llm(
 
     if not strategy or not members:
         return base
-    extra = [_member_to_llm(m, config) for m in members]
+    extra = [_member_to_llm(m, config, defaults) for m in members]
     return MultiLLMProvider([base, *extra], strategy)
 
 
@@ -1023,6 +1058,29 @@ def __init__(
 
             self.query_analyzer = DateparserQueryAnalyzer()
 
+        # Resolve each operation's effective per-request defaults: a per-op override
+        # (``HINDSIGHT_API_RETAIN_LLM_TIMEOUT``, ``..._MAX_RETRIES``, ``..._INITIAL_BACKOFF``,
+        # ``..._MAX_BACKOFF``) wins, otherwise the global ``llm_*``. Threaded all the way
+        # into the provider so the configured value actually governs the call (issue #2452);
+        # previously these per-op fields were resolved into config but never reached the
+        # provider, which silently used the global/method default.
+        def _op_defaults(prefix: str) -> _LLMCallDefaults:
+            def pick(field: str) -> Any:
+                per_op = getattr(config, f"{prefix}llm_{field}") if prefix else None
+                return per_op if per_op is not None else getattr(config, f"llm_{field}")
+
+            return _LLMCallDefaults(
+                timeout=pick("timeout"),
+                max_retries=pick("max_retries"),
+                initial_backoff=pick("initial_backoff"),
+                max_backoff=pick("max_backoff"),
+            )
+
+        default_call_defaults = _op_defaults("")
+        retain_call_defaults = _op_defaults("retain_")
+        reflect_call_defaults = _op_defaults("reflect_")
+        consolidation_call_defaults = _op_defaults("consolidation_")
+
         # Initialize LLM configuration (default, used as fallback)
         _default_base_llm = LLMConfig(
             provider=memory_llm_provider,
@@ -1042,8 +1100,9 @@ def __init__(
             vertexai_project_id=config.llm_vertexai_project_id,
             vertexai_region=config.llm_vertexai_region,
             vertexai_service_account_key=config.llm_vertexai_service_account_key,
+            **default_call_defaults.as_kwargs(),
         )
-        self._llm_config = _build_llm(_default_base_llm, config, "")
+        self._llm_config = _build_llm(_default_base_llm, config, "", default_call_defaults)
 
         # Store client and model for convenience (deprecated: use _llm_config.call() instead).
         # Read from the primary member so a multi-LLM chain behaves like the base config here.
@@ -1085,8 +1144,9 @@ def __init__(
             vertexai_project_id=config.llm_vertexai_project_id,
             vertexai_region=config.llm_vertexai_region,
             vertexai_service_account_key=config.llm_vertexai_service_account_key,
+            **retain_call_defaults.as_kwargs(),
         )
-        self._retain_llm_config = _build_llm(_retain_base_llm, config, "retain_")
+        self._retain_llm_config = _build_llm(_retain_base_llm, config, "retain_", retain_call_defaults)
 
         # Reflect LLM config - for think/observe operations (can use lighter models)
         reflect_provider = reflect_llm_provider or config.reflect_llm_provider or memory_llm_provider
@@ -1122,8 +1182,9 @@ def __init__(
             vertexai_project_id=config.llm_vertexai_project_id,
             vertexai_region=config.llm_vertexai_region,
             vertexai_service_account_key=config.llm_vertexai_service_account_key,
+            **reflect_call_defaults.as_kwargs(),
         )
-        self._reflect_llm_config = _build_llm(_reflect_base_llm, config, "reflect_")
+        self._reflect_llm_config = _build_llm(_reflect_base_llm, config, "reflect_", reflect_call_defaults)
 
         # Consolidation LLM config - for mental model consolidation (can use efficient models)
         consolidation_provider = consolidation_llm_provider or config.consolidation_llm_provider or memory_llm_provider
@@ -1159,8 +1220,11 @@ def __init__(
             vertexai_project_id=config.llm_vertexai_project_id,
             vertexai_region=config.llm_vertexai_region,
             vertexai_service_account_key=config.llm_vertexai_service_account_key,
+            **consolidation_call_defaults.as_kwargs(),
+        )
+        self._consolidation_llm_config = _build_llm(
+            _consolidation_base_llm, config, "consolidation_", consolidation_call_defaults
         )
-        self._consolidation_llm_config = _build_llm(_consolidation_base_llm, config, "consolidation_")
 
         # Initialize cross-encoder reranker (cached for performance)
         self._cross_encoder_reranker = CrossEncoderReranker(cross_encoder=cross_encoder)