2222from deepeval .test_case import LLMTestCase , LLMTestCaseParams
2323
2424from pydantic import ValidationError
25- from tenacity import (
26- retry ,
27- retry_if_exception ,
28- stop_after_attempt ,
29- wait_exponential ,
30- before_sleep_log ,
31- )
3225
3326from lightspeed_evaluation .core .llm .deepeval import DeepEvalLLMManager
3427from lightspeed_evaluation .core .metrics .manager import MetricLevel , MetricManager
@@ -64,11 +57,6 @@ def __init__(
6457 self .deepeval_llm_manager = deepeval_llm_manager
6558 self .metric_manager = metric_manager
6659
67- # Get num_retries from LLM configuration (default: 6 to match DeepEval's hardcoded value)
68- # Note: DeepEval's internal retry logic uses hardcoded MAX_RETRIES=6,
69- # but we add our own retry layer to respect user configuration
70- self .num_retries = self .deepeval_llm_manager .llm_params .get ("num_retries" , 6 )
71-
7260 def evaluate ( # pylint: disable=R0913,R0917
7361 self ,
7462 metric_name : str ,
@@ -214,67 +202,6 @@ def _convert_evaluation_params(
214202 # Return the successfully converted list, or None if it ended up empty
215203 return converted if converted else None
216204
217- def _is_retryable_exception (self , exception : BaseException ) -> bool :
218- """Check if exception should trigger a retry.
219-
220- Retryable conditions:
221- - Rate limiting (429 errors from LLM provider)
222- - Timeout errors
223- - Temporary network failures
224- - LLM provider temporary errors
225-
226- Args:
227- exception: The exception to check
228-
229- Returns:
230- True if the exception should trigger a retry, False otherwise
231- """
232- # We retry on all exceptions because DeepEval/LiteLLM internally
233- # handles specific error types (RateLimitError, Timeout, etc.)
234- # This matches DeepEval's hardcoded behavior: retryable_exceptions = (Exception,)
235- return isinstance (exception , Exception )
236-
def _measure_with_retry(
    self, metric: GEval, test_case: LLMTestCase, context: str
) -> None:
    """Run ``metric.measure()`` under a tenacity retry policy.

    One attempt consists of ``metric.measure(test_case)`` followed by
    ``flush_deepevals_pending_tasks()`` on the DeepEval LLM manager. Both
    calls sit inside the retried unit, so a failure in either one causes
    the whole pair to be executed again.

    Retry policy:
        - any exception accepted by ``_is_retryable_exception`` triggers
          another attempt;
        - ``self.num_retries`` is handed to ``stop_after_attempt``, so it
          bounds the total number of attempts;
        - the wait between attempts grows exponentially, clamped to the
          range 1..10;
        - a WARNING is logged before each sleep;
        - the last exception is re-raised once attempts are exhausted.

    Args:
        metric: GEval metric instance to execute.
        test_case: LLM test case passed to ``metric.measure()``.
        context: Label used in the failure log message
            (e.g., "turn-level" or "conversation-level").

    Raises:
        Exception: The last exception raised by an attempt, after it has
            been logged at ERROR level.
    """

    # NOTE: the inner function keeps the name ``_measure`` because the
    # before-sleep log hook reports the wrapped callable's name.
    def _measure() -> None:
        metric.measure(test_case)
        self.deepeval_llm_manager.flush_deepevals_pending_tasks()

    # Apply the retry wrapper by direct call rather than decorator syntax.
    measure_with_policy = retry(
        retry=retry_if_exception(self._is_retryable_exception),
        stop=stop_after_attempt(self.num_retries),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        before_sleep=before_sleep_log(logger, logging.WARNING),
        reraise=True,
    )(_measure)

    try:
        measure_with_policy()
    except Exception as exc:
        # Record the final failure, then let the caller handle it.
        logger.error(
            "GEval %s evaluation failed after %d retry attempts: %s: %s",
            context,
            self.num_retries,
            type(exc).__name__,
            str(exc),
        )
        raise
277-
278205 def _evaluate_turn ( # pylint: disable=R0913,R0917
279206 self ,
280207 turn_data : Any ,
@@ -366,9 +293,10 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
366293 # Create test case for a single turn
367294 test_case = LLMTestCase (** test_case_kwargs )
368295
369- # Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
296+ # Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
370297 try :
371- self ._measure_with_retry (metric , test_case , "turn-level" )
298+ metric .measure (test_case )
299+ self .deepeval_llm_manager .flush_deepevals_pending_tasks ()
372300
373301 # Extract score and reason
374302 score = metric .score
@@ -397,6 +325,9 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
397325
398326 return score , reason
399327 except Exception as e : # pylint: disable=W0718
328+ logger .error (
329+ "GEval turn-level evaluation failed: %s: %s" , type (e ).__name__ , str (e )
330+ )
400331 logger .debug (
401332 "Test case input: %s..." ,
402333 test_case .input [:100 ] if test_case .input else "None" ,
@@ -494,9 +425,10 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
494425 actual_output = "\n " .join (conversation_output ),
495426 )
496427
497- # Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
428+ # Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
498429 try :
499- self ._measure_with_retry (metric , test_case , "conversation-level" )
430+ metric .measure (test_case )
431+ self .deepeval_llm_manager .flush_deepevals_pending_tasks ()
500432
501433 # Extract score and reason
502434 score = metric .score
@@ -519,6 +451,11 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
519451
520452 return score , reason
521453 except Exception as e : # pylint: disable=W0718
454+ logger .error (
455+ "GEval conversation-level evaluation failed: %s: %s" ,
456+ type (e ).__name__ ,
457+ str (e ),
458+ )
522459 logger .debug ("Conversation turns: %d" , len (conv_data .turns ))
523460 logger .debug (
524461 "Test case input preview: %s..." ,
0 commit comments