2121from deepeval .metrics .g_eval import Rubric
2222from deepeval .test_case import LLMTestCase , LLMTestCaseParams
2323from pydantic import ValidationError
24- from tenacity import (
25- retry ,
26- retry_if_exception ,
27- stop_after_attempt ,
28- wait_exponential ,
29- before_sleep_log ,
30- )
3124
3225from lightspeed_evaluation .core .llm .deepeval import DeepEvalLLMManager
3326from lightspeed_evaluation .core .metrics .manager import MetricLevel , MetricManager
@@ -63,11 +56,6 @@ def __init__(
6356 self .deepeval_llm_manager = deepeval_llm_manager
6457 self .metric_manager = metric_manager
6558
66- # Get num_retries from LLM configuration (default: 6 to match DeepEval's hardcoded value)
67- # Note: DeepEval's internal retry logic uses hardcoded MAX_RETRIES=6,
68- # but we add our own retry layer to respect user configuration
69- self .num_retries = self .deepeval_llm_manager .llm_params .get ("num_retries" , 6 )
70-
7159 def evaluate ( # pylint: disable=R0913,R0917
7260 self ,
7361 metric_name : str ,
@@ -213,67 +201,6 @@ def _convert_evaluation_params(
213201 # Return the successfully converted list, or None if it ended up empty
214202 return converted if converted else None
215203
216- def _is_retryable_exception (self , exception : BaseException ) -> bool :
217- """Check if exception should trigger a retry.
218-
219- Retryable conditions:
220- - Rate limiting (429 errors from LLM provider)
221- - Timeout errors
222- - Temporary network failures
223- - LLM provider temporary errors
224-
225- Args:
226- exception: The exception to check
227-
228- Returns:
229- True if the exception should trigger a retry, False otherwise
230- """
231- # We retry on all exceptions because DeepEval/LiteLLM internally
232- # handles specific error types (RateLimitError, Timeout, etc.)
233- # This matches DeepEval's hardcoded behavior: retryable_exceptions = (Exception,)
234- return isinstance (exception , Exception )
235-
236- def _measure_with_retry (
237- self , metric : GEval , test_case : LLMTestCase , context : str
238- ) -> None :
239- """Execute metric.measure() with configurable retry logic.
240-
241- This method wraps DeepEval's metric.measure() with our own retry layer
242- to respect user-configured num_retries (DeepEval hardcodes MAX_RETRIES=6).
243-
244- Args:
245- metric: GEval metric instance
246- test_case: LLM test case to evaluate
247- context: Description for logging (e.g., "turn-level" or "conversation-level")
248-
249- Raises:
250- Exception: Re-raises the last exception if all retry attempts fail
251- """
252- retry_decorator = retry (
253- retry = retry_if_exception (self ._is_retryable_exception ),
254- stop = stop_after_attempt (self .num_retries ),
255- wait = wait_exponential (multiplier = 1 , min = 1 , max = 10 ),
256- before_sleep = before_sleep_log (logger , logging .WARNING ),
257- reraise = True ,
258- )
259-
260- @retry_decorator
261- def _measure () -> None :
262- metric .measure (test_case )
263- self .deepeval_llm_manager .flush_deepevals_pending_tasks ()
264-
265- try :
266- _measure ()
267- except Exception as e :
268- logger .error (
269- "GEval %s evaluation failed after %d retry attempts: %s: %s" ,
270- context ,
271- self .num_retries ,
272- type (e ).__name__ ,
273- str (e ),
274- )
275- raise
276-
277204 def _evaluate_turn ( # pylint: disable=R0913,R0917
278205 self ,
279206 turn_data : Any ,
@@ -367,7 +294,8 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
367294
368295 # Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
369296 try :
370- self ._measure_with_retry (metric , test_case , "turn-level" )
297+ metric .measure (test_case )
298+ self .deepeval_llm_manager .flush_deepevals_pending_tasks ()
371299
372300 # Extract score and reason
373301 score = metric .score
@@ -378,24 +306,25 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
378306 )
379307
380308 # CRITICAL: Warn if score is None (indicates evaluation failure)
381- # Without this warning, silent conversion to 0.0 masks bugs like :
309+ # None scores indicate evaluation failures that need investigation :
382310 # - Rate limiting (429 errors after all retries exhausted)
383311 # - LLM judge returning malformed JSON that fails parsing
384312 # - Timeout errors from LLM provider
385313 # - API quota/credits exhausted
386- # This makes debugging nearly impossible as failed evaluations
387- # appear as low scores (0.0) instead of errors.
314+ # Warning helps identify these failures for debugging.
388315 if score is None :
389316 logger .warning (
390- "GEval turn-level metric returned None score; defaulting to 0.0 . "
317+ "GEval turn-level metric returned None score. "
391318 "This typically indicates LLM judge failure (rate limiting, timeout, "
392319 "invalid JSON response, or quota exhausted). Reason: %s" ,
393320 reason ,
394321 )
395- score = 0.0
396322
397323 return score , reason
398324 except Exception as e : # pylint: disable=W0718
325+ logger .error (
326+ "GEval turn-level evaluation failed: %s: %s" , type (e ).__name__ , str (e )
327+ )
399328 logger .debug (
400329 "Test case input: %s..." ,
401330 test_case .input [:100 ] if test_case .input else "None" ,
@@ -495,7 +424,8 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
495424
496425 # Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
497426 try :
498- self ._measure_with_retry (metric , test_case , "conversation-level" )
427+ metric .measure (test_case )
428+ self .deepeval_llm_manager .flush_deepevals_pending_tasks ()
499429
500430 # Extract score and reason
501431 score = metric .score
@@ -509,15 +439,19 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
509439 # See turn-level evaluation for detailed explanation of why this matters
510440 if score is None :
511441 logger .warning (
512- "GEval conversation-level metric returned None score; defaulting to 0.0 . "
442+ "GEval conversation-level metric returned None score. "
513443 "This typically indicates LLM judge failure (rate limiting, timeout, "
514444 "invalid JSON response, or quota exhausted). Reason: %s" ,
515445 reason ,
516446 )
517- score = 0.0
518447
519448 return score , reason
520449 except Exception as e : # pylint: disable=W0718
450+ logger .error (
451+ "GEval conversation-level evaluation failed: %s: %s" ,
452+ type (e ).__name__ ,
453+ str (e ),
454+ )
521455 logger .debug ("Conversation turns: %d" , len (conv_data .turns ))
522456 logger .debug (
523457 "Test case input preview: %s..." ,
0 commit comments