@@ -253,6 +253,7 @@ def create_llm_provider(
253253 prompt_cache_enabled : bool = False ,
254254 litellmrouter_config : dict [str , Any ] | None = None ,
255255 gemini_service_tier : str | None = None ,
256+ timeout : float | None = None ,
256257) -> Any : # Returns LLMInterface
257258 """
258259 Factory function to create the appropriate LLM provider implementation.
@@ -280,6 +281,12 @@ def create_llm_provider(
280281 vertexai_project_id: Vertex AI project ID (for VertexAI provider).
281282 vertexai_region: Vertex AI region (for VertexAI provider).
282283 vertexai_credentials: Vertex AI credentials object (for VertexAI provider).
284+ timeout: Per-request LLM timeout in seconds (resolved by the caller from the
285+ per-operation/global config). Threaded into the providers that honour a
286+ configurable request timeout (LiteLLM, LiteLLM Router, OpenAI-compatible,
287+ Nous). ``None`` lets each provider fall back to its own default
288+ (``HINDSIGHT_API_LLM_TIMEOUT`` / ``DEFAULT_LLM_TIMEOUT`` for those four;
289+ Anthropic and Gemini keep their provider-specific defaults).
283290
284291 Returns:
285292 LLMInterface implementation for the specified provider.
@@ -378,6 +385,7 @@ def create_llm_provider(
378385 reasoning_effort = reasoning_effort ,
379386 extra_body = extra_body ,
380387 default_headers = default_headers ,
388+ timeout = timeout ,
381389 )
382390
383391 elif provider_lower == "litellmrouter" :
@@ -397,6 +405,7 @@ def create_llm_provider(
397405 reasoning_effort = reasoning_effort ,
398406 extra_body = extra_body ,
399407 default_headers = default_headers ,
408+ timeout = timeout ,
400409 )
401410
402411 elif provider_lower == "bedrock" :
@@ -411,6 +420,7 @@ def create_llm_provider(
411420 extra_body = extra_body ,
412421 default_headers = default_headers ,
413422 bedrock_service_tier = bedrock_service_tier ,
423+ timeout = timeout ,
414424 )
415425
416426 elif provider_lower == "llamacpp" :
@@ -457,6 +467,7 @@ def create_llm_provider(
457467 model = model ,
458468 reasoning_effort = reasoning_effort ,
459469 extra_body = extra_body ,
470+ timeout = timeout ,
460471 )
461472
462473 elif provider_lower in (
@@ -483,6 +494,7 @@ def create_llm_provider(
483494 groq_service_tier = groq_service_tier ,
484495 openai_service_tier = openai_service_tier ,
485496 extra_body = extra_body ,
497+ timeout = timeout ,
486498 )
487499
488500 else :
@@ -515,6 +527,10 @@ def __init__(
515527 vertexai_project_id : str | None = None ,
516528 vertexai_region : str | None = None ,
517529 vertexai_service_account_key : str | None = None ,
530+ timeout : float | None = None ,
531+ max_retries : int | None = None ,
532+ initial_backoff : float | None = None ,
533+ max_backoff : float | None = None ,
518534 ):
519535 """
520536 Initialize LLM provider.
@@ -543,6 +559,17 @@ def __init__(
543559 ``"us-central1"`` when ``None``).
544560 vertexai_service_account_key: Path to a Vertex AI service-account key file for
545561 ``provider="vertexai"`` (uses ADC when ``None``).
562+ timeout: Per-request LLM timeout in seconds. Resolved by the caller from the
563+ per-operation/global config (``retain_llm_timeout`` falling back to
564+ ``llm_timeout``, etc.). ``None`` lets each provider apply its own default.
565+ max_retries: Default retry-attempt budget for ``call`` / ``call_with_tools``
566+ when the per-call argument is omitted. Resolved by the caller from the
567+ per-operation/global config (``reflect_llm_max_retries`` falling back to
568+ ``llm_max_retries``, etc.). ``None`` keeps each method's own fallback.
569+ initial_backoff: Default initial retry backoff (seconds), same resolution as
570+ ``max_retries``. ``None`` keeps each method's own fallback.
571+ max_backoff: Default maximum retry backoff (seconds), same resolution as
572+ ``max_retries``. ``None`` keeps each method's own fallback.
546573
547574 This constructor uses every argument as passed and does not read global
548575 ``HindsightConfig``: resolving the server-level default for a ``None`` argument is the
@@ -556,6 +583,15 @@ def __init__(
556583 self .base_url = base_url
557584 self .model = model
558585 self .reasoning_effort = reasoning_effort
586+ # Per-request timeout (seconds). Used verbatim — the caller resolves the
587+ # per-operation/global fallback. ``None`` defers to the provider default.
588+ self .timeout = timeout
589+ # Default retry policy for call()/call_with_tools(). The caller resolves the
590+ # per-operation/global fallback; ``None`` keeps each method's own fallback so
591+ # providers built without a resolved config (from_env, tests) are unchanged.
592+ self .max_retries = max_retries
593+ self .initial_backoff = initial_backoff
594+ self .max_backoff = max_backoff
559595 self .litellmrouter_config = litellmrouter_config
560596 # Service tiers from hierarchical config (not env vars)
561597 self .groq_service_tier = groq_service_tier
@@ -706,6 +742,7 @@ def __init__(
706742 gemini_safety_settings = self .gemini_safety_settings ,
707743 prompt_cache_enabled = self .prompt_cache_enabled ,
708744 litellmrouter_config = router_config ,
745+ timeout = self .timeout ,
709746 )
710747
711748 # Backward compatibility: Keep mock provider properties
@@ -762,9 +799,9 @@ async def call(
762799 max_completion_tokens : int | None = None ,
763800 temperature : float | None = None ,
764801 scope : str = "memory" ,
765- max_retries : int = 10 ,
766- initial_backoff : float = 1.0 ,
767- max_backoff : float = 60.0 ,
802+ max_retries : int | None = None ,
803+ initial_backoff : float | None = None ,
804+ max_backoff : float | None = None ,
768805 skip_validation : bool = False ,
769806 strict_schema : bool = False ,
770807 return_usage : bool = False ,
@@ -779,9 +816,12 @@ async def call(
779816 max_completion_tokens: Maximum tokens in response.
780817 temperature: Sampling temperature (0.0-2.0).
781818 scope: Scope identifier for tracking.
782- max_retries: Maximum retry attempts.
783- initial_backoff: Initial backoff time in seconds.
784- max_backoff: Maximum backoff time in seconds.
819+ max_retries: Maximum retry attempts. ``None`` uses the provider's configured
820+ default (per-operation/global ``llm_max_retries``), else 10.
821+ initial_backoff: Initial backoff time in seconds. ``None`` uses the provider's
822+ configured default (``llm_initial_backoff``), else 1.0.
823+ max_backoff: Maximum backoff time in seconds. ``None`` uses the provider's
824+ configured default (``llm_max_backoff``), else 60.0.
785825 skip_validation: Return raw JSON without Pydantic validation.
786826 strict_schema: Per-call override requesting grammar-enforced (json_schema strict)
787827 structured output instead of the soft json_object path. The server-level
@@ -806,6 +846,20 @@ async def call(
806846 structured = "+structured" if response_format is not None else ""
807847 set_stage (f"llm.{ self .provider } .{ scope } { structured } " )
808848
849+ # Resolve the retry policy: explicit per-call arg wins, else the provider's
850+ # configured per-operation/global default, else this method's own fallback.
851+ max_retries = (
852+ max_retries if max_retries is not None else (self .max_retries if self .max_retries is not None else 10 )
853+ )
854+ initial_backoff = (
855+ initial_backoff
856+ if initial_backoff is not None
857+ else (self .initial_backoff if self .initial_backoff is not None else 1.0 )
858+ )
859+ max_backoff = (
860+ max_backoff if max_backoff is not None else (self .max_backoff if self .max_backoff is not None else 60.0 )
861+ )
862+
809863 # Resolve strict-schema once, here, rather than in each provider: the
810864 # per-call argument OR the server-level HINDSIGHT_API_LLM_STRICT_SCHEMA
811865 # flag. Providers with a json_schema response_format (OpenAI-compatible,
@@ -908,9 +962,9 @@ async def call_with_tools(
908962 max_completion_tokens : int | None = None ,
909963 temperature : float | None = None ,
910964 scope : str = "tools" ,
911- max_retries : int = 5 ,
912- initial_backoff : float = 1.0 ,
913- max_backoff : float = 30.0 ,
965+ max_retries : int | None = None ,
966+ initial_backoff : float | None = None ,
967+ max_backoff : float | None = None ,
914968 tool_choice : str | dict [str , Any ] = "auto" ,
915969 cached_prefix : str | None = None ,
916970 ) -> "LLMToolCallResult" :
@@ -923,9 +977,12 @@ async def call_with_tools(
923977 max_completion_tokens: Maximum tokens in response.
924978 temperature: Sampling temperature (0.0-2.0).
925979 scope: Scope identifier for tracking.
926- max_retries: Maximum retry attempts.
927- initial_backoff: Initial backoff time in seconds.
928- max_backoff: Maximum backoff time in seconds.
980+ max_retries: Maximum retry attempts. ``None`` uses the provider's configured
981+ default (per-operation/global ``llm_max_retries``), else 5.
982+ initial_backoff: Initial backoff time in seconds. ``None`` uses the provider's
983+ configured default (``llm_initial_backoff``), else 1.0.
984+ max_backoff: Maximum backoff time in seconds. ``None`` uses the provider's
985+ configured default (``llm_max_backoff``), else 30.0.
929986 tool_choice: How to choose tools - "auto", "none", "required", or {"type": "function", "function": {"name": "..."}}
930987
931988 Returns:
@@ -935,6 +992,20 @@ async def call_with_tools(
935992
936993 set_stage (f"llm.{ self .provider } .{ scope } +tools" )
937994
995+ # Resolve the retry policy: explicit per-call arg wins, else the provider's
996+ # configured per-operation/global default, else this method's own fallback.
997+ max_retries = (
998+ max_retries if max_retries is not None else (self .max_retries if self .max_retries is not None else 5 )
999+ )
1000+ initial_backoff = (
1001+ initial_backoff
1002+ if initial_backoff is not None
1003+ else (self .initial_backoff if self .initial_backoff is not None else 1.0 )
1004+ )
1005+ max_backoff = (
1006+ max_backoff if max_backoff is not None else (self .max_backoff if self .max_backoff is not None else 30.0 )
1007+ )
1008+
9381009 # Failures forwarded to the GenAI recorder; successes recorded by providers.
9391010 from ..tracing import get_span_recorder
9401011 from .llm_trace import (
@@ -1178,6 +1249,7 @@ def from_env(cls) -> "LLMProvider":
11781249 DEFAULT_LLM_PROMPT_CACHE_ENABLED ,
11791250 DEFAULT_LLM_PROVIDER ,
11801251 DEFAULT_LLM_REASONING_EFFORT ,
1252+ DEFAULT_LLM_TIMEOUT ,
11811253 ENV_LLM_API_KEY ,
11821254 ENV_LLM_BASE_URL ,
11831255 ENV_LLM_BEDROCK_SERVICE_TIER ,
@@ -1192,6 +1264,7 @@ def from_env(cls) -> "LLMProvider":
11921264 ENV_LLM_PROMPT_CACHE_ENABLED ,
11931265 ENV_LLM_PROVIDER ,
11941266 ENV_LLM_REASONING_EFFORT ,
1267+ ENV_LLM_TIMEOUT ,
11951268 ENV_LLM_VERTEXAI_PROJECT_ID ,
11961269 ENV_LLM_VERTEXAI_REGION ,
11971270 ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY ,
@@ -1245,6 +1318,7 @@ def from_env(cls) -> "LLMProvider":
12451318 vertexai_project_id = os .getenv (ENV_LLM_VERTEXAI_PROJECT_ID ) or None ,
12461319 vertexai_region = os .getenv (ENV_LLM_VERTEXAI_REGION ) or None ,
12471320 vertexai_service_account_key = os .getenv (ENV_LLM_VERTEXAI_SERVICE_ACCOUNT_KEY ) or None ,
1321+ timeout = float (os .getenv (ENV_LLM_TIMEOUT , str (DEFAULT_LLM_TIMEOUT ))),
12481322 )
12491323
12501324
0 commit comments