@@ -76,6 +76,12 @@ async def _fetch_trace_list_with_retry(
7676) -> Any :
7777 """Fetch trace list with rate limit retry logic."""
7878 list_retries = 0
79+ rollout_id : Optional [str ] = None
80+ if tags :
81+ for t in tags :
82+ if isinstance (t , str ) and t .startswith ("rollout_id:" ):
83+ rollout_id = t .split (":" , 1 )[1 ] if ":" in t else t
84+ break
7985 while list_retries < max_retries :
8086 try :
8187 traces = langfuse_client .api .trace .list (
@@ -101,22 +107,34 @@ async def _fetch_trace_list_with_retry(
101107 return traces
102108 except Exception as e :
103109 list_retries += 1
104- if list_retries < max_retries and ("429" in str (e ) or "Empty results" in str (e )):
105- sleep_time = 2 ** list_retries # Exponential backoff for rate limits
110+ is_rate_limit_or_empty = "429" in str (e ) or "Empty results" in str (e )
111+ is_timeout = "timed out" in str (e ) or "Read timed out" in str (e )
112+
113+ if list_retries < max_retries and (is_rate_limit_or_empty or is_timeout ):
114+ sleep_time = 2 ** list_retries
106115 logger .warning (
107- "Retrying trace.list in %ds (attempt %d/%d): %s" , sleep_time , list_retries , max_retries , str (e )
116+ "Retrying trace.list in %ds (attempt %d/%d): %s" ,
117+ sleep_time ,
118+ list_retries ,
119+ max_retries ,
120+ str (e ),
108121 )
109122 await asyncio .sleep (sleep_time )
110123 elif list_retries == max_retries :
111124 # Return 404 if we've retried max_retries
112125 # TODO: write some tests around proxy exception handling
113- logger .error ("Failed to fetch trace list after %d retries: %s" , max_retries , e )
126+ logger .error (
127+ "Failed to fetch trace list after %d retries (rollout_id=%s): %s" ,
128+ max_retries ,
129+ rollout_id ,
130+ e ,
131+ )
114132 raise HTTPException (
115133 status_code = 404 , detail = f"Failed to fetch traces after { max_retries } retries: { str (e )} "
116134 )
117135 else :
118136 # Catch all other exceptions
119- logger .error ("Failed to fetch trace list: %s" , e )
137+ logger .error ("Failed to fetch trace list (rollout_id=%s) : %s" , rollout_id , e )
120138 raise HTTPException (status_code = 500 , detail = f"Failed to fetch traces: { str (e )} " )
121139
122140
0 commit comments