fix(llm): thread per-scope LLM timeout into constructed providers

r266-tech · r266-tech · commit 146819b9410d · 2026-06-30T17:17:04.000+08:00
The retain/reflect/consolidation LLM timeouts were resolved into config but never passed to the provider, so they were silently ignored and every call fell back to the global HINDSIGHT_API_LLM_TIMEOUT / 120s default. Thread timeout through create_llm_provider and LLMProvider into the LiteLLM (and bedrock-alias) constructions, and pass the resolved per-op value from the MemoryEngine builds, falling back to the global llm_timeout when the per-op value is unset. Add plumbing tests asserting that the configured timeout reaches the constructed provider. Fixes #2452.
diff --git a/hindsight-api-slim/hindsight_api/engine/llm_wrapper.py b/hindsight-api-slim/hindsight_api/engine/llm_wrapper.py
@@ -253,6 +253,7 @@ def create_llm_provider(
     prompt_cache_enabled: bool = False,
     litellmrouter_config: dict[str, Any] | None = None,
     gemini_service_tier: str | None = None,
+    timeout: float | None = None,
 ) -> Any:  # Returns LLMInterface
     """
     Factory function to create the appropriate LLM provider implementation.
@@ -267,6 +268,9 @@ def create_llm_provider(
         openai_service_tier: OpenAI service tier (for OpenAI provider) - None (default) or "flex" (50% cheaper).
         bedrock_service_tier: Bedrock service tier (for Bedrock provider) - None (default), "flex", "priority", or "reserved".
         gemini_service_tier: Gemini service tier (for Gemini provider) - None (default) or "flex" (50% cheaper).
+        timeout: Per-call hard timeout (seconds) for the constructed provider. Threaded into LiteLLM
+            (incl. the ``bedrock/`` alias), which wraps each request in ``asyncio.wait_for(timeout)``.
+            ``None`` lets the provider fall back to ``HINDSIGHT_API_LLM_TIMEOUT`` / the built-in default.
         extra_body: Extra request-body params merged into the provider's native
             call. Threaded into OpenAI-compatible, Fireworks, Anthropic, Gemini/
             VertexAI and LiteLLM providers (each merges them in its own parameter
@@ -378,6 +382,7 @@ def create_llm_provider(
             reasoning_effort=reasoning_effort,
             extra_body=extra_body,
             default_headers=default_headers,
+            timeout=timeout,
         )
 
     elif provider_lower == "litellmrouter":
@@ -411,6 +416,7 @@ def create_llm_provider(
             extra_body=extra_body,
             default_headers=default_headers,
             bedrock_service_tier=bedrock_service_tier,
+            timeout=timeout,
         )
 
     elif provider_lower == "llamacpp":
@@ -503,6 +509,7 @@ def __init__(
         base_url: str,
         model: str,
         reasoning_effort: str = "low",
+        timeout: float | None = None,
         groq_service_tier: str | None = None,
         openai_service_tier: str | None = None,
         bedrock_service_tier: str | None = None,
@@ -525,6 +532,9 @@ def __init__(
             base_url: Base URL for the API.
             model: Model name.
             reasoning_effort: Reasoning effort level for supported providers.
+            timeout: Per-call hard timeout (seconds), resolved by the caller from the per-op config
+                (``retain``/``reflect``/``consolidation``) with the global ``llm_timeout`` fallback. Forwarded
+                to the LiteLLM constructions (incl. the ``bedrock/`` alias); ``None`` defers to the provider's own env/default.
             groq_service_tier: Groq service tier ("on_demand", "flex", "auto") - from config.
             openai_service_tier: OpenAI service tier (None or "flex") - from config.
             bedrock_service_tier: Bedrock service tier (None, "flex", "priority", "reserved") - from config.
@@ -556,6 +566,7 @@ def __init__(
         self.base_url = base_url
         self.model = model
         self.reasoning_effort = reasoning_effort
+        self.timeout = timeout
         self.litellmrouter_config = litellmrouter_config
         # Service tiers from hierarchical config (not env vars)
         self.groq_service_tier = groq_service_tier
@@ -694,6 +705,7 @@ def __init__(
             base_url=self.base_url,
             model=self.model,
             reasoning_effort=self.reasoning_effort,
+            timeout=self.timeout,
             groq_service_tier=self.groq_service_tier,
             openai_service_tier=self.openai_service_tier,
             bedrock_service_tier=self.bedrock_service_tier,
@@ -1178,6 +1190,7 @@ def from_env(cls) -> "LLMProvider":
             DEFAULT_LLM_PROMPT_CACHE_ENABLED,
             DEFAULT_LLM_PROVIDER,
             DEFAULT_LLM_REASONING_EFFORT,
+            DEFAULT_LLM_TIMEOUT,
             ENV_LLM_API_KEY,
             ENV_LLM_BASE_URL,
             ENV_LLM_BEDROCK_SERVICE_TIER,
@@ -1192,6 +1205,7 @@ def from_env(cls) -> "LLMProvider":
             ENV_LLM_PROMPT_CACHE_ENABLED,
             ENV_LLM_PROVIDER,
             ENV_LLM_REASONING_EFFORT,
+            ENV_LLM_TIMEOUT,
             ENV_LLM_VERTEXAI_PROJECT_ID,
             ENV_LLM_VERTEXAI_REGION,
             ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY,
@@ -1229,6 +1243,7 @@ def from_env(cls) -> "LLMProvider":
             base_url=base_url,
             model=model,
             reasoning_effort=os.getenv(ENV_LLM_REASONING_EFFORT, DEFAULT_LLM_REASONING_EFFORT),
+            timeout=float(os.getenv(ENV_LLM_TIMEOUT, str(DEFAULT_LLM_TIMEOUT))),
             extra_body=extra_body,
             default_headers=default_headers,
             groq_service_tier=os.getenv(ENV_LLM_GROQ_SERVICE_TIER, DEFAULT_LLM_GROQ_SERVICE_TIER),
diff --git a/hindsight-api-slim/hindsight_api/engine/memory_engine.py b/hindsight-api-slim/hindsight_api/engine/memory_engine.py
@@ -406,6 +406,7 @@ def _member_to_llm(member: "LLMMemberConfig", config: HindsightConfig) -> LLMCon
         base_url=member.base_url,
         model=member.model,
         reasoning_effort=member.reasoning_effort or config.llm_reasoning_effort,
+        timeout=config.llm_timeout,
         extra_body=member.extra_body,
         default_headers=member.default_headers or config.llm_default_headers,
         bedrock_service_tier=member.bedrock_service_tier,
@@ -1030,6 +1031,7 @@ def __init__(
             base_url=memory_llm_base_url,
             model=memory_llm_model,
             reasoning_effort=config.llm_reasoning_effort,
+            timeout=config.llm_timeout,
             extra_body=config.llm_extra_body,
             default_headers=config.llm_default_headers,
             litellmrouter_config=config.llm_litellmrouter_config,
@@ -1073,6 +1075,7 @@ def __init__(
             base_url=retain_base_url,
             model=retain_model,
             reasoning_effort=config.llm_reasoning_effort,
+            timeout=config.retain_llm_timeout if config.retain_llm_timeout is not None else config.llm_timeout,
             extra_body=config.llm_extra_body,
             default_headers=config.llm_default_headers,
             litellmrouter_config=config.retain_llm_litellmrouter_config or config.llm_litellmrouter_config,
@@ -1110,6 +1113,7 @@ def __init__(
             base_url=reflect_base_url,
             model=reflect_model,
             reasoning_effort=config.llm_reasoning_effort,
+            timeout=config.reflect_llm_timeout if config.reflect_llm_timeout is not None else config.llm_timeout,
             extra_body=config.llm_extra_body,
             default_headers=config.llm_default_headers,
             litellmrouter_config=config.reflect_llm_litellmrouter_config or config.llm_litellmrouter_config,
@@ -1147,6 +1151,9 @@ def __init__(
             base_url=consolidation_base_url,
             model=consolidation_model,
             reasoning_effort=config.llm_reasoning_effort,
+            timeout=config.consolidation_llm_timeout
+            if config.consolidation_llm_timeout is not None
+            else config.llm_timeout,
             extra_body=config.llm_extra_body,
             default_headers=config.llm_default_headers,
             litellmrouter_config=config.consolidation_llm_litellmrouter_config or config.llm_litellmrouter_config,
diff --git a/hindsight-api-slim/tests/test_llm_timeout_threading.py b/hindsight-api-slim/tests/test_llm_timeout_threading.py
@@ -0,0 +1,69 @@
+"""Plumbing tests for the per-scope LLM timeout (HINDSIGHT_API_{RETAIN,REFLECT,CONSOLIDATION}_LLM_TIMEOUT).
+
+These assert the *wiring* — that a configured timeout actually reaches the
+constructed provider — rather than just that the env var parses into config
+(covered by test_config_validation.py). The value is threaded
+config -> LLMProvider -> create_llm_provider -> LiteLLMLLM, and LiteLLMLLM wraps
+each request in ``asyncio.wait_for(timeout)`` (see test_litellm_timeout.py).
+
+Before this wiring, the per-op timeouts were resolved into config but never passed
+to the provider, so ``retain``/``reflect``/``consolidation`` calls silently fell
+back to the global ``HINDSIGHT_API_LLM_TIMEOUT`` default and ignored the
+operator-set value (issue #2452). If MemoryEngine ever stops passing ``timeout``
+through, the per-op knob goes inert again — these checks guard that bridge.
+"""
+
+from hindsight_api.config import DEFAULT_LLM_TIMEOUT, ENV_LLM_TIMEOUT
+from hindsight_api.engine.llm_wrapper import LLMConfig, create_llm_provider
+
+_LITELLM_MODEL = "litellm_proxy/test-model"
+
+
+def test_llm_config_threads_timeout_to_provider_impl():
+    """End-to-end: LLMConfig -> create_llm_provider -> LiteLLMLLM carries the timeout."""
+    llm = LLMConfig(
+        provider="litellm",
+        api_key="k",
+        base_url="",
+        model=_LITELLM_MODEL,
+        timeout=300.0,
+    )
+    assert llm._provider_impl.timeout == 300.0
+
+
+def test_create_llm_provider_threads_timeout():
+    """The factory forwards ``timeout`` into the LiteLLM provider."""
+    impl = create_llm_provider(
+        provider="litellm",
+        api_key="k",
+        base_url="",
+        model=_LITELLM_MODEL,
+        reasoning_effort="low",
+        timeout=42.0,
+    )
+    assert impl.timeout == 42.0
+
+
+def test_bedrock_alias_threads_timeout():
+    """The ``bedrock/`` LiteLLM alias path also carries the timeout."""
+    llm = LLMConfig(
+        provider="bedrock",
+        api_key="",
+        base_url="",
+        model="us.amazon.nova-2-lite-v1:0",
+        timeout=77.0,
+    )
+    assert llm._provider_impl.timeout == 77.0
+
+
+def test_unset_timeout_falls_back_to_default(monkeypatch):
+    """``None`` (no per-op or global override) resolves to the finite default,
+    never ``None`` — preserving the existing hard-timeout backstop."""
+    monkeypatch.delenv(ENV_LLM_TIMEOUT, raising=False)
+    llm = LLMConfig(
+        provider="litellm",
+        api_key="k",
+        base_url="",
+        model=_LITELLM_MODEL,
+    )
+    assert llm._provider_impl.timeout == DEFAULT_LLM_TIMEOUT