2222from deepeval .test_case import LLMTestCase , LLMTestCaseParams
2323
2424from pydantic import ValidationError
25- from tenacity import (
26- retry ,
27- retry_if_exception ,
28- stop_after_attempt ,
29- wait_exponential ,
30- before_sleep_log ,
31- )
3225
3326from lightspeed_evaluation .core .llm .deepeval import DeepEvalLLMManager
3427from lightspeed_evaluation .core .metrics .manager import MetricLevel , MetricManager
@@ -64,11 +57,6 @@ def __init__(
6457 self .deepeval_llm_manager = deepeval_llm_manager
6558 self .metric_manager = metric_manager
6659
67- # Get num_retries from LLM configuration (default: 6 to match DeepEval's hardcoded value)
68- # Note: DeepEval's internal retry logic uses hardcoded MAX_RETRIES=6,
69- # but we add our own retry layer to respect user configuration
70- self .num_retries = self .deepeval_llm_manager .llm_params .get ("num_retries" , 6 )
71-
7260 def evaluate ( # pylint: disable=R0913,R0917
7361 self ,
7462 metric_name : str ,
@@ -214,67 +202,6 @@ def _convert_evaluation_params(
214202 # Return the successfully converted list, or None if it ended up empty
215203 return converted if converted else None
216204
217- def _is_retryable_exception (self , exception : BaseException ) -> bool :
218- """Check if exception should trigger a retry.
219-
220- Retryable conditions:
221- - Rate limiting (429 errors from LLM provider)
222- - Timeout errors
223- - Temporary network failures
224- - LLM provider temporary errors
225-
226- Args:
227- exception: The exception to check
228-
229- Returns:
230- True if the exception should trigger a retry, False otherwise
231- """
232- # We retry on all exceptions because DeepEval/LiteLLM internally
233- # handles specific error types (RateLimitError, Timeout, etc.)
234- # This matches DeepEval's hardcoded behavior: retryable_exceptions = (Exception,)
235- return isinstance (exception , Exception )
236-
237- def _measure_with_retry (
238- self , metric : GEval , test_case : LLMTestCase , context : str
239- ) -> None :
240- """Execute metric.measure() with configurable retry logic.
241-
242- This method wraps DeepEval's metric.measure() with our own retry layer
243- to respect user-configured num_retries (DeepEval hardcodes MAX_RETRIES=6).
244-
245- Args:
246- metric: GEval metric instance
247- test_case: LLM test case to evaluate
248- context: Description for logging (e.g., "turn-level" or "conversation-level")
249-
250- Raises:
251- Exception: Re-raises the last exception if all retry attempts fail
252- """
253- retry_decorator = retry (
254- retry = retry_if_exception (self ._is_retryable_exception ),
255- stop = stop_after_attempt (self .num_retries ),
256- wait = wait_exponential (multiplier = 1 , min = 1 , max = 10 ),
257- before_sleep = before_sleep_log (logger , logging .WARNING ),
258- reraise = True ,
259- )
260-
261- @retry_decorator
262- def _measure () -> None :
263- metric .measure (test_case )
264- self .deepeval_llm_manager .flush_deepevals_pending_tasks ()
265-
266- try :
267- _measure ()
268- except Exception as e :
269- logger .error (
270- "GEval %s evaluation failed after %d retry attempts: %s: %s" ,
271- context ,
272- self .num_retries ,
273- type (e ).__name__ ,
274- str (e ),
275- )
276- raise
277-
278205 def _evaluate_turn ( # pylint: disable=R0913,R0917
279206 self ,
280207 turn_data : Any ,
@@ -368,7 +295,8 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
368295
369296 # Evaluate the metric directly (DeepEval normalizes score to [0, 1]; pass through as-is)
370297 try :
371- self ._measure_with_retry (metric , test_case , "turn-level" )
298+ metric .measure (test_case )
299+ self .deepeval_llm_manager .flush_deepevals_pending_tasks ()
372300
373301 # Extract score and reason
374302 score = metric .score
@@ -379,21 +307,19 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
379307 )
380308
381309 # CRITICAL: Warn if score is None (indicates evaluation failure)
382- # Without this warning, silent conversion to 0.0 masks bugs like :
310+ # None scores indicate evaluation failures that need investigation :
383311 # - Rate limiting (429 errors after all retries exhausted)
384312 # - LLM judge returning malformed JSON that fails parsing
385313 # - Timeout errors from LLM provider
386314 # - API quota/credits exhausted
387- # This makes debugging nearly impossible as failed evaluations
388- # appear as low scores (0.0) instead of errors.
315+ # Warning helps identify these failures for debugging.
389316 if score is None :
390317 logger .warning (
391- "GEval turn-level metric returned None score; defaulting to 0.0 . "
318+ "GEval turn-level metric returned None score. "
392319 "This typically indicates LLM judge failure (rate limiting, timeout, "
393320 "invalid JSON response, or quota exhausted). Reason: %s" ,
394321 reason ,
395322 )
396- score = 0.0
397323
398324 return score , reason
399325 except Exception as e : # pylint: disable=W0718
@@ -496,7 +422,8 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
496422
497423 # Evaluate the metric directly (DeepEval normalizes score to [0, 1]; pass through as-is)
498424 try :
499- self ._measure_with_retry (metric , test_case , "conversation-level" )
425+ metric .measure (test_case )
426+ self .deepeval_llm_manager .flush_deepevals_pending_tasks ()
500427
501428 # Extract score and reason
502429 score = metric .score
@@ -510,12 +437,11 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
510437 # See turn-level evaluation for detailed explanation of why this matters
511438 if score is None :
512439 logger .warning (
513- "GEval conversation-level metric returned None score; defaulting to 0.0 . "
440+ "GEval conversation-level metric returned None score. "
514441 "This typically indicates LLM judge failure (rate limiting, timeout, "
515442 "invalid JSON response, or quota exhausted). Reason: %s" ,
516443 reason ,
517444 )
518- score = 0.0
519445
520446 return score , reason
521447 except Exception as e : # pylint: disable=W0718
0 commit comments