1717
1818# LiteLLM
1919import litellm
20+ from litellm .exceptions import RateLimitError
2021from litellm .utils import ModelResponse
2122
2223# Internal imports
@@ -116,15 +117,27 @@ def _respect_rate_limit(self) -> None:
116117 # Keep only timestamps within the last minute
117118 self ._request_timestamps = [
118119 ts for ts in self ._request_timestamps if current - ts < 60 ]
119- if len (self ._request_timestamps ) < self .rpm_limit :
120+
121+ # Be more conservative - wait if we're at 90% of the limit
122+ conservative_limit = max (1 , int (self .rpm_limit * 0.9 ))
123+
124+ if len (self ._request_timestamps ) < conservative_limit :
120125 return
126+
121127 # Need to wait until the earliest request is outside the 60-second window
122128 earliest = self ._request_timestamps [0 ]
123- # Add a 1s margin to avoid accidental rate limit exceedance
124- sleep_time = 61 - (current - earliest )
129+ # Add a 2s margin to avoid accidental rate limit exceedance
130+ sleep_time = 62 - (current - earliest )
125131 if sleep_time > 0 :
126- logger .warning (f"Rate limit reached | Waiting { sleep_time :.1f} s" )
132+ logger .warning (
133+ f"Rate limit approaching | Requests: { len (self ._request_timestamps )} /{ self .rpm_limit } | "
134+ f"Waiting { sleep_time :.1f} s"
135+ )
127136 time .sleep (sleep_time )
137+ # Clean up old timestamps after waiting
138+ current = time .monotonic ()
139+ self ._request_timestamps = [
140+ ts for ts in self ._request_timestamps if current - ts < 60 ]
128141
129142 @staticmethod
130143 def _strip_code_fences (content : str ) -> str :
@@ -258,9 +271,33 @@ def generate(
258271 if response_format is not None :
259272 completion_params ["response_format" ] = response_format
260273
261- # Call LiteLLM completion with batch messages
262- response : list [ModelResponse ] = litellm .batch_completion (
263- ** completion_params )
274+ # Call LiteLLM completion with batch messages - retry on rate limit
275+ max_retries = 3
276+ retry_delay = 5 # Start with 5 seconds
277+ response = None
278+
279+ for attempt in range (max_retries ):
280+ try :
281+ response : list [ModelResponse ] = litellm .batch_completion (
282+ ** completion_params )
283+ break # Success, exit retry loop
284+ except RateLimitError as e :
285+ if attempt < max_retries - 1 :
286+ wait_time = retry_delay * (2 ** attempt ) # Exponential backoff
287+ logger .warning (
288+ f"Rate limit hit | Provider: { self .provider_name } | Model: { self .model_id } | "
289+ f"Attempt { attempt + 1 } /{ max_retries } | Waiting { wait_time } s before retry"
290+ )
291+ time .sleep (wait_time )
292+ else :
293+ logger .error (
294+ f"Rate limit exceeded after { max_retries } attempts | "
295+ f"Provider: { self .provider_name } | Model: { self .model_id } "
296+ )
297+ raise
298+
299+ if response is None :
300+ raise RuntimeError ("Failed to get response after retries" )
264301
265302 # Record timestamp for rate limiting (one timestamp per batch item)
266303 if self .rpm_limit is not None :
@@ -270,7 +307,24 @@ def generate(
270307
271308 # Extract content from each response
272309 results = []
273- for one_response in response :
310+ for idx , one_response in enumerate (response ):
311+ if isinstance (one_response , Exception ):
312+ if isinstance (one_response , RateLimitError ):
313+ logger .warning (
314+ "Rate limit error in batch item | Provider: %s | Model: %s | Item: %d" ,
315+ self .provider_name ,
316+ self .model_id ,
317+ idx ,
318+ )
319+ raise RuntimeError (
320+ f"Batch item { idx } failed during generation: { one_response } "
321+ ) from one_response
322+
323+ if not getattr (one_response , "choices" , None ):
324+ raise RuntimeError (
325+ f"Unexpected response type from LiteLLM batch completion at item { idx } : { type (one_response ).__name__ } "
326+ )
327+
274328 content = one_response .choices [0 ].message .content
275329
276330 if response_format is not None :
0 commit comments