Skip to content

Commit 14bf073

Browse files
committed
retry and have better logs
1 parent 148beb5 commit 14bf073

1 file changed

Lines changed: 23 additions & 5 deletions

File tree

eval_protocol/proxy/proxy_core/langfuse.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,12 @@ async def _fetch_trace_list_with_retry(
7676
) -> Any:
7777
"""Fetch trace list with rate limit retry logic."""
7878
list_retries = 0
79+
rollout_id: Optional[str] = None
80+
if tags:
81+
for t in tags:
82+
if isinstance(t, str) and t.startswith("rollout_id:"):
83+
rollout_id = t.split(":", 1)[1] if ":" in t else t
84+
break
7985
while list_retries < max_retries:
8086
try:
8187
traces = langfuse_client.api.trace.list(
@@ -101,22 +107,34 @@ async def _fetch_trace_list_with_retry(
101107
return traces
102108
except Exception as e:
103109
list_retries += 1
104-
if list_retries < max_retries and ("429" in str(e) or "Empty results" in str(e)):
105-
sleep_time = 2**list_retries # Exponential backoff for rate limits
110+
is_rate_limit_or_empty = "429" in str(e) or "Empty results" in str(e)
111+
is_timeout = "timed out" in str(e) or "Read timed out" in str(e)
112+
113+
if list_retries < max_retries and (is_rate_limit_or_empty or is_timeout):
114+
sleep_time = 2**list_retries
106115
logger.warning(
107-
"Retrying trace.list in %ds (attempt %d/%d): %s", sleep_time, list_retries, max_retries, str(e)
116+
"Retrying trace.list in %ds (attempt %d/%d): %s",
117+
sleep_time,
118+
list_retries,
119+
max_retries,
120+
str(e),
108121
)
109122
await asyncio.sleep(sleep_time)
110123
elif list_retries == max_retries:
111124
# Return 404 if we've retried max_retries
112125
# TODO: write some tests around proxy exception handling
113-
logger.error("Failed to fetch trace list after %d retries: %s", max_retries, e)
126+
logger.error(
127+
"Failed to fetch trace list after %d retries (rollout_id=%s): %s",
128+
max_retries,
129+
rollout_id,
130+
e,
131+
)
114132
raise HTTPException(
115133
status_code=404, detail=f"Failed to fetch traces after {max_retries} retries: {str(e)}"
116134
)
117135
else:
118136
# Catch all other exceptions
119-
logger.error("Failed to fetch trace list: %s", e)
137+
logger.error("Failed to fetch trace list (rollout_id=%s): %s", rollout_id, e)
120138
raise HTTPException(status_code=500, detail=f"Failed to fetch traces: {str(e)}")
121139

122140

0 commit comments

Comments
 (0)