|
22 | 22 | from deepeval.test_case import LLMTestCase, LLMTestCaseParams |
23 | 23 |
|
24 | 24 | from pydantic import ValidationError |
| 25 | +from tenacity import ( |
| 26 | + retry, |
| 27 | + retry_if_exception, |
| 28 | + stop_after_attempt, |
| 29 | + wait_exponential, |
| 30 | + before_sleep_log, |
| 31 | +) |
25 | 32 |
|
26 | 33 | from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager |
27 | 34 | from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager |
@@ -57,6 +64,11 @@ def __init__( |
57 | 64 | self.deepeval_llm_manager = deepeval_llm_manager |
58 | 65 | self.metric_manager = metric_manager |
59 | 66 |
|
| 67 | + # Get num_retries from LLM configuration (default: 6 to match DeepEval's hardcoded value) |
| 68 | + # Note: DeepEval's internal retry logic uses hardcoded MAX_RETRIES=6, |
| 69 | + # but we add our own retry layer to respect user configuration |
| 70 | + self.num_retries = self.deepeval_llm_manager.llm_params.get("num_retries", 6) |
| 71 | + |
60 | 72 | def evaluate( # pylint: disable=R0913,R0917 |
61 | 73 | self, |
62 | 74 | metric_name: str, |
@@ -202,6 +214,67 @@ def _convert_evaluation_params( |
202 | 214 | # Return the successfully converted list, or None if it ended up empty |
203 | 215 | return converted if converted else None |
204 | 216 |
|
| 217 | + def _is_retryable_exception(self, exception: BaseException) -> bool: |
| 218 | + """Check if exception should trigger a retry. |
| 219 | +
|
| 220 | + Retryable conditions: |
| 221 | + - Rate limiting (429 errors from LLM provider) |
| 222 | + - Timeout errors |
| 223 | + - Temporary network failures |
| 224 | + - LLM provider temporary errors |
| 225 | +
|
| 226 | + Args: |
| 227 | + exception: The exception to check |
| 228 | +
|
| 229 | + Returns: |
| 230 | + True if the exception should trigger a retry, False otherwise |
| 231 | + """ |
| 232 | + # We retry on all exceptions because DeepEval/LiteLLM internally |
| 233 | + # handles specific error types (RateLimitError, Timeout, etc.) |
| 234 | + # This matches DeepEval's hardcoded behavior: retryable_exceptions = (Exception,) |
| 235 | + return isinstance(exception, Exception) |
| 236 | + |
def _measure_with_retry(
    self, metric: GEval, test_case: LLMTestCase, context: str
) -> None:
    """Execute ``metric.measure()`` under a configurable retry policy.

    Wraps DeepEval's ``metric.measure()`` (followed by a flush of DeepEval's
    pending tasks) in a tenacity retry layer whose attempt budget comes from
    ``self.num_retries``. The value is used as the *total* number of
    attempts (it is passed to ``stop_after_attempt``), so ``1`` means
    "try once, never retry".

    ``self.num_retries`` originates from user configuration, so it is
    coerced to an integer and clamped to at least one attempt. A missing or
    non-numeric value falls back to a single attempt; handing such a value
    to tenacity would raise ``TypeError`` inside the stop check and hide the
    real evaluation error.

    Args:
        metric: GEval metric instance to run.
        test_case: LLM test case to evaluate.
        context: Description for logging (e.g. "turn-level" or
            "conversation-level").

    Raises:
        Exception: Re-raises the last exception if all attempts fail
            (``reraise=True`` makes tenacity surface the original error
            rather than a ``RetryError``).
    """
    # Sanitize the configured budget before it reaches tenacity or the
    # "%d" log format below.
    try:
        max_attempts = max(1, int(self.num_retries))
    except (TypeError, ValueError):
        max_attempts = 1

    retry_decorator = retry(
        retry=retry_if_exception(self._is_retryable_exception),
        stop=stop_after_attempt(max_attempts),
        # Exponential backoff, bounded to the 1-10 second range.
        wait=wait_exponential(multiplier=1, min=1, max=10),
        # Emit a WARNING before each sleep so retries are visible in logs.
        before_sleep=before_sleep_log(logger, logging.WARNING),
        reraise=True,
    )

    @retry_decorator
    def _measure() -> None:
        # Both steps are inside the retried unit: a failure while flushing
        # pending tasks triggers a fresh measure() as well.
        metric.measure(test_case)
        self.deepeval_llm_manager.flush_deepevals_pending_tasks()

    try:
        _measure()
    except Exception as exc:
        # Report the number of attempts actually made, then let the caller
        # handle the original exception.
        logger.error(
            "GEval %s evaluation failed after %d attempt(s): %s: %s",
            context,
            max_attempts,
            type(exc).__name__,
            str(exc),
        )
        raise
| 277 | + |
205 | 278 | def _evaluate_turn( # pylint: disable=R0913,R0917 |
206 | 279 | self, |
207 | 280 | turn_data: Any, |
@@ -293,22 +366,37 @@ def _evaluate_turn( # pylint: disable=R0913,R0917 |
293 | 366 | # Create test case for a single turn |
294 | 367 | test_case = LLMTestCase(**test_case_kwargs) |
295 | 368 |
|
296 | | - # Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is) |
| 369 | + # Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is) |
297 | 370 | try: |
298 | | - metric.measure(test_case) |
299 | | - self.deepeval_llm_manager.flush_deepevals_pending_tasks() |
| 371 | + self._measure_with_retry(metric, test_case, "turn-level") |
300 | 372 |
|
301 | | - score = metric.score if metric.score is not None else 0.0 |
| 373 | + # Extract score and reason |
| 374 | + score = metric.score |
302 | 375 | reason = ( |
303 | 376 | str(metric.reason) |
304 | 377 | if hasattr(metric, "reason") and metric.reason |
305 | 378 | else "No reason provided" |
306 | 379 | ) |
| 380 | + |
| 381 | + # CRITICAL: Warn if score is None (indicates evaluation failure) |
| 382 | + # Without this warning, silent conversion to 0.0 masks bugs like: |
| 383 | + # - Rate limiting (429 errors after all retries exhausted) |
| 384 | + # - LLM judge returning malformed JSON that fails parsing |
| 385 | + # - Timeout errors from LLM provider |
| 386 | + # - API quota/credits exhausted |
| 387 | + # This makes debugging nearly impossible as failed evaluations |
| 388 | + # appear as low scores (0.0) instead of errors. |
| 389 | + if score is None: |
| 390 | + logger.warning( |
| 391 | + "GEval turn-level metric returned None score; defaulting to 0.0. " |
| 392 | + "This typically indicates LLM judge failure (rate limiting, timeout, " |
| 393 | + "invalid JSON response, or quota exhausted). Reason: %s", |
| 394 | + reason, |
| 395 | + ) |
| 396 | + score = 0.0 |
| 397 | + |
307 | 398 | return score, reason |
308 | 399 | except Exception as e: # pylint: disable=W0718 |
309 | | - logger.error( |
310 | | - "GEval turn-level evaluation failed: %s: %s", type(e).__name__, str(e) |
311 | | - ) |
312 | 400 | logger.debug( |
313 | 401 | "Test case input: %s...", |
314 | 402 | test_case.input[:100] if test_case.input else "None", |
@@ -406,24 +494,31 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914 |
406 | 494 | actual_output="\n".join(conversation_output), |
407 | 495 | ) |
408 | 496 |
|
409 | | - # Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is) |
| 497 | + # Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is) |
410 | 498 | try: |
411 | | - metric.measure(test_case) |
412 | | - self.deepeval_llm_manager.flush_deepevals_pending_tasks() |
| 499 | + self._measure_with_retry(metric, test_case, "conversation-level") |
413 | 500 |
|
414 | | - score = metric.score if metric.score is not None else 0.0 |
| 501 | + # Extract score and reason |
| 502 | + score = metric.score |
415 | 503 | reason = ( |
416 | 504 | str(metric.reason) |
417 | 505 | if hasattr(metric, "reason") and metric.reason |
418 | 506 | else "No reason provided" |
419 | 507 | ) |
| 508 | + |
| 509 | + # CRITICAL: Warn if score is None (indicates evaluation failure) |
| 510 | + # See turn-level evaluation for detailed explanation of why this matters |
| 511 | + if score is None: |
| 512 | + logger.warning( |
| 513 | + "GEval conversation-level metric returned None score; defaulting to 0.0. " |
| 514 | + "This typically indicates LLM judge failure (rate limiting, timeout, " |
| 515 | + "invalid JSON response, or quota exhausted). Reason: %s", |
| 516 | + reason, |
| 517 | + ) |
| 518 | + score = 0.0 |
| 519 | + |
420 | 520 | return score, reason |
421 | 521 | except Exception as e: # pylint: disable=W0718 |
422 | | - logger.error( |
423 | | - "GEval conversation-level evaluation failed: %s: %s", |
424 | | - type(e).__name__, |
425 | | - str(e), |
426 | | - ) |
427 | 522 | logger.debug("Conversation turns: %d", len(conv_data.turns)) |
428 | 523 | logger.debug( |
429 | 524 | "Test case input preview: %s...", |
|
0 commit comments