Skip to content

Commit 7a133e2

Browse files
committed
feat: make LLM backoff configurable end-to-end
- extend LLMConfig with backoff delay/attempt/factor fields and thread them through LLMExtractionStrategy, LLMContentFilter, table extraction, and Docker API handlers - expose the backoff parameter knobs on perform_completion_with_backoff/aperform_completion_with_backoff and document them in the md_v2 guides
1 parent b36c6da commit 7a133e2

File tree

9 files changed

+85
-16
lines changed

9 files changed

+85
-16
lines changed

crawl4ai/async_configs.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1792,7 +1792,10 @@ def __init__(
17921792
frequency_penalty: Optional[float] = None,
17931793
presence_penalty: Optional[float] = None,
17941794
stop: Optional[List[str]] = None,
1795-
n: Optional[int] = None,
1795+
n: Optional[int] = None,
1796+
backoff_base_delay: Optional[int] = None,
1797+
backoff_max_attempts: Optional[int] = None,
1798+
backoff_exponential_factor: Optional[int] = None,
17961799
):
17971800
"""Configuaration class for LLM provider and API token."""
17981801
self.provider = provider
@@ -1821,6 +1824,9 @@ def __init__(
18211824
self.presence_penalty = presence_penalty
18221825
self.stop = stop
18231826
self.n = n
1827+
self.backoff_base_delay = backoff_base_delay if backoff_base_delay is not None else 2
1828+
self.backoff_max_attempts = backoff_max_attempts if backoff_max_attempts is not None else 3
1829+
self.backoff_exponential_factor = backoff_exponential_factor if backoff_exponential_factor is not None else 2
18241830

18251831
@staticmethod
18261832
def from_kwargs(kwargs: dict) -> "LLMConfig":
@@ -1834,7 +1840,10 @@ def from_kwargs(kwargs: dict) -> "LLMConfig":
18341840
frequency_penalty=kwargs.get("frequency_penalty"),
18351841
presence_penalty=kwargs.get("presence_penalty"),
18361842
stop=kwargs.get("stop"),
1837-
n=kwargs.get("n")
1843+
n=kwargs.get("n"),
1844+
backoff_base_delay=kwargs.get("backoff_base_delay"),
1845+
backoff_max_attempts=kwargs.get("backoff_max_attempts"),
1846+
backoff_exponential_factor=kwargs.get("backoff_exponential_factor")
18381847
)
18391848

18401849
def to_dict(self):
@@ -1848,7 +1857,10 @@ def to_dict(self):
18481857
"frequency_penalty": self.frequency_penalty,
18491858
"presence_penalty": self.presence_penalty,
18501859
"stop": self.stop,
1851-
"n": self.n
1860+
"n": self.n,
1861+
"backoff_base_delay": self.backoff_base_delay,
1862+
"backoff_max_attempts": self.backoff_max_attempts,
1863+
"backoff_exponential_factor": self.backoff_exponential_factor
18521864
}
18531865

18541866
def clone(self, **kwargs):

crawl4ai/content_filter_strategy.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -980,6 +980,9 @@ def _proceed_with_chunk(
980980
prompt,
981981
api_token,
982982
base_url=base_url,
983+
base_delay=self.llm_config.backoff_base_delay,
984+
max_attempts=self.llm_config.backoff_max_attempts,
985+
exponential_factor=self.llm_config.backoff_exponential_factor,
983986
extra_args=extra_args,
984987
)
985988

crawl4ai/extraction_strategy.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -649,6 +649,9 @@ def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
649649
base_url=self.llm_config.base_url,
650650
json_response=self.force_json_response,
651651
extra_args=self.extra_args,
652+
base_delay=self.llm_config.backoff_base_delay,
653+
max_attempts=self.llm_config.backoff_max_attempts,
654+
exponential_factor=self.llm_config.backoff_exponential_factor
652655
) # , json_response=self.extract_type == "schema")
653656
# Track usage
654657
usage = TokenUsage(
@@ -846,6 +849,9 @@ async def aextract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
846849
base_url=self.llm_config.base_url,
847850
json_response=self.force_json_response,
848851
extra_args=self.extra_args,
852+
base_delay=self.llm_config.backoff_base_delay,
853+
max_attempts=self.llm_config.backoff_max_attempts,
854+
exponential_factor=self.llm_config.backoff_exponential_factor
849855
)
850856
# Track usage
851857
usage = TokenUsage(

crawl4ai/table_extraction.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -795,6 +795,9 @@ def extract_tables(self, element: etree.Element, **kwargs) -> List[Dict[str, Any
795795
api_token=self.llm_config.api_token,
796796
base_url=self.llm_config.base_url,
797797
json_response=True,
798+
base_delay=self.llm_config.backoff_base_delay,
799+
max_attempts=self.llm_config.backoff_max_attempts,
800+
exponential_factor=self.llm_config.backoff_exponential_factor,
798801
extra_args=self.extra_args
799802
)
800803

@@ -1116,6 +1119,9 @@ def _process_chunk(self, chunk_html: str, chunk_index: int, total_chunks: int, h
11161119
api_token=self.llm_config.api_token,
11171120
base_url=self.llm_config.base_url,
11181121
json_response=True,
1122+
base_delay=self.llm_config.backoff_base_delay,
1123+
max_attempts=self.llm_config.backoff_max_attempts,
1124+
exponential_factor=self.llm_config.backoff_exponential_factor,
11191125
extra_args=self.extra_args
11201126
)
11211127

crawl4ai/utils.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1745,6 +1745,9 @@ def perform_completion_with_backoff(
17451745
api_token,
17461746
json_response=False,
17471747
base_url=None,
1748+
base_delay=2,
1749+
max_attempts=3,
1750+
exponential_factor=2,
17481751
**kwargs,
17491752
):
17501753
"""
@@ -1761,6 +1764,9 @@ def perform_completion_with_backoff(
17611764
api_token (str): The API token for authentication.
17621765
json_response (bool): Whether to request a JSON response. Defaults to False.
17631766
base_url (Optional[str]): The base URL for the API. Defaults to None.
1767+
base_delay (int): The base delay in seconds. Defaults to 2.
1768+
max_attempts (int): The maximum number of attempts. Defaults to 3.
1769+
exponential_factor (int): The exponential factor. Defaults to 2.
17641770
**kwargs: Additional arguments for the API request.
17651771
17661772
Returns:
@@ -1770,9 +1776,6 @@ def perform_completion_with_backoff(
17701776
from litellm import completion
17711777
from litellm.exceptions import RateLimitError
17721778

1773-
max_attempts = 3
1774-
base_delay = 2 # Base delay in seconds, you can adjust this based on your needs
1775-
17761779
extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
17771780
if json_response:
17781781
extra_args["response_format"] = {"type": "json_object"}
@@ -1798,7 +1801,7 @@ def perform_completion_with_backoff(
17981801
# Check if we have exhausted our max attempts
17991802
if attempt < max_attempts - 1:
18001803
# Calculate the delay and wait
1801-
delay = base_delay * (2**attempt) # Exponential backoff formula
1804+
delay = base_delay * (exponential_factor**attempt) # Exponential backoff formula
18021805
print(f"Waiting for {delay} seconds before retrying...")
18031806
time.sleep(delay)
18041807
else:
@@ -1831,6 +1834,9 @@ async def aperform_completion_with_backoff(
18311834
api_token,
18321835
json_response=False,
18331836
base_url=None,
1837+
base_delay=2,
1838+
max_attempts=3,
1839+
exponential_factor=2,
18341840
**kwargs,
18351841
):
18361842
"""
@@ -1847,6 +1853,9 @@ async def aperform_completion_with_backoff(
18471853
api_token (str): The API token for authentication.
18481854
json_response (bool): Whether to request a JSON response. Defaults to False.
18491855
base_url (Optional[str]): The base URL for the API. Defaults to None.
1856+
base_delay (int): The base delay in seconds. Defaults to 2.
1857+
max_attempts (int): The maximum number of attempts. Defaults to 3.
1858+
exponential_factor (int): The exponential factor. Defaults to 2.
18501859
**kwargs: Additional arguments for the API request.
18511860
18521861
Returns:
@@ -1857,9 +1866,6 @@ async def aperform_completion_with_backoff(
18571866
from litellm.exceptions import RateLimitError
18581867
import asyncio
18591868

1860-
max_attempts = 3
1861-
base_delay = 2 # Base delay in seconds, you can adjust this based on your needs
1862-
18631869
extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
18641870
if json_response:
18651871
extra_args["response_format"] = {"type": "json_object"}
@@ -1885,7 +1891,7 @@ async def aperform_completion_with_backoff(
18851891
# Check if we have exhausted our max attempts
18861892
if attempt < max_attempts - 1:
18871893
# Calculate the delay and wait
1888-
delay = base_delay * (2**attempt) # Exponential backoff formula
1894+
delay = base_delay * (exponential_factor**attempt) # Exponential backoff formula
18891895
print(f"Waiting for {delay} seconds before retrying...")
18901896
await asyncio.sleep(delay)
18911897
else:

deploy/docker/api.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,10 @@ async def handle_llm_qa(
108108
prompt_with_variables=prompt,
109109
api_token=get_llm_api_key(config), # Returns None to let litellm handle it
110110
temperature=get_llm_temperature(config),
111-
base_url=get_llm_base_url(config)
111+
base_url=get_llm_base_url(config),
112+
base_delay=config["llm"].get("backoff_base_delay", 2),
113+
max_attempts=config["llm"].get("backoff_max_attempts", 3),
114+
exponential_factor=config["llm"].get("backoff_exponential_factor", 2)
112115
)
113116

114117
return response.choices[0].message.content

docs/md_v2/api/parameters.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,10 +439,19 @@ LLMConfig is useful to pass LLM provider config to strategies and functions that
439439
| **`provider`** | `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)* | Which LLM provider to use.
440440
| **`api_token`** |1.Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then,`"GEMINI_API_KEY"` will be read from environment variables <br/> 2. API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"` <br/> 3. Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"` | API token to use for the given provider
441441
| **`base_url`** |Optional. Custom API endpoint | If your provider has a custom endpoint
442+
| **`backoff_base_delay`** |Optional. `int` *(default: `2`)* | Seconds to wait before the first retry when the provider throttles a request.
443+
| **`backoff_max_attempts`** |Optional. `int` *(default: `3`)* | Total tries (initial call + retries) before surfacing an error.
444+
| **`backoff_exponential_factor`** |Optional. `int` *(default: `2`)* | Multiplier that increases the wait time for each retry (`delay = base_delay * factor^attempt`).
442445

443446
## 3.2 Example Usage
444447
```python
445-
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
448+
llm_config = LLMConfig(
449+
provider="openai/gpt-4o-mini",
450+
api_token=os.getenv("OPENAI_API_KEY"),
451+
backoff_base_delay=1, # optional
452+
backoff_max_attempts=5, # optional
453+
backoff_exponential_factor=3, # optional
454+
)
446455
```
447456

448457
## 4. Putting It All Together

docs/md_v2/complete-sdk-reference.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1593,8 +1593,20 @@ The `clone()` method:
15931593
- Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"`
15941594
3. **`base_url`**:
15951595
- If your provider has a custom endpoint
1596-
```python
1597-
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
1596+
1597+
4. **Backoff controls** *(optional)*:
1598+
- `backoff_base_delay` *(default `2` seconds)* – how long to pause before the first retry if the provider rate-limits you.
1599+
- `backoff_max_attempts` *(default `3`)* – total tries for the same prompt (initial call + retries).
1600+
- `backoff_exponential_factor` *(default `2`)* – how quickly the pause grows between retries. A factor of 2 yields waits like 2s → 4s → 8s.
1601+
- Because these plug into Crawl4AI’s retry helper, every LLM strategy automatically follows the pacing you define here.
1602+
```python
1603+
llm_config = LLMConfig(
1604+
provider="openai/gpt-4o-mini",
1605+
api_token=os.getenv("OPENAI_API_KEY"),
1606+
backoff_base_delay=1, # optional
1607+
backoff_max_attempts=5, # optional
1608+
backoff_exponential_factor=3, # optional
1609+
)
15981610
```
15991611
## 4. Putting It All Together
16001612
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call's needs:

docs/md_v2/core/browser-crawler-config.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,8 +308,20 @@ The `clone()` method:
308308
3. **`base_url`**:
309309
- If your provider has a custom endpoint
310310

311+
4. **Retry/backoff controls** *(optional)*:
312+
- `backoff_base_delay` *(default `2` seconds)* – base delay inserted before the first retry when the provider returns a rate-limit response.
313+
- `backoff_max_attempts` *(default `3`)* – total number of attempts (initial call plus retries) before the request is surfaced as an error.
314+
- `backoff_exponential_factor` *(default `2`)* – growth rate for the retry delay (`delay = base_delay * factor^attempt`).
315+
- These values are forwarded to the shared `perform_completion_with_backoff` helper, ensuring every strategy that consumes your `LLMConfig` honors the same throttling policy.
316+
311317
```python
312-
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
318+
llm_config = LLMConfig(
319+
provider="openai/gpt-4o-mini",
320+
api_token=os.getenv("OPENAI_API_KEY"),
321+
backoff_base_delay=1, # optional
322+
backoff_max_attempts=5, # optional
323+
backoff_exponential_factor=3, # optional
324+
)
313325
```
314326

315327
## 4. Putting It All Together

0 commit comments

Comments
 (0)