Skip to content

Commit e28e2b8

Browse files
committed
fix: Retry logic for Deepevals score=None bug
1 parent a058015 commit e28e2b8

9 files changed

Lines changed: 457 additions & 154 deletions

File tree

bug_test_geval_score_mismatch.yaml

Lines changed: 0 additions & 54 deletions
This file was deleted.

config/system.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ core:
2525
llm_pool:
2626
defaults:
2727
timeout: 300
28-
num_retries: 3 # Retry attempts for LLM judge (applies to GEval, custom metrics, etc.; default: 6 if not specified)
28+
num_retries: 3
2929
parameters:
3030
temperature: 0.0
3131
max_completion_tokens: 1024

src/lightspeed_evaluation/core/llm/deepeval.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010

1111
import litellm
1212
from deepeval.models import LiteLLMModel
13+
from tenacity import stop_after_attempt
1314

15+
from lightspeed_evaluation.core.constants import DEFAULT_LLM_RETRIES
1416
from lightspeed_evaluation.core.llm.litellm_patch import setup_litellm_ssl
1517

1618
logger = logging.getLogger(__name__)
@@ -46,15 +48,46 @@ def __init__(self, model_name: str, llm_params: dict[str, Any]):
4648
# LiteLLMModel stores **kwargs in self.kwargs and merges them into
4749
# every litellm.completion() call
4850
# Note: Forbidden keys are rejected at LLMParametersConfig load time
51+
52+
# Override DeepEval's hardcoded retry logic with user configuration
53+
# DeepEval uses @retry decorators that capture MAX_RETRIES at import time
54+
# We must patch the retry decorators after import but before instantiation
55+
num_retries = self.llm_params.get("num_retries", DEFAULT_LLM_RETRIES)
56+
57+
self._patch_deepeval_retries(num_retries)
58+
4959
self.llm_model = LiteLLMModel(
5060
model=self.model_name,
5161
timeout=self.llm_params.get("timeout"),
52-
num_retries=self.llm_params.get("num_retries"),
5362
**self.llm_params.get("parameters", {}),
5463
)
5564

5665
print(f"✅ DeepEval LLM Manager: {self.model_name}")
5766

67+
def _patch_deepeval_retries(self, max_retries: int) -> None:
68+
"""Monkey-patch DeepEval's retry decorators to use configured max_retries.
69+
70+
DeepEval's @retry decorators capture MAX_RETRIES at import time.
71+
We patch the 'stop' attribute on each retry decorator to use our value.
72+
"""
73+
# Patch the stop condition on all retry-decorated methods
74+
for method_name in [
75+
"generate",
76+
"a_generate",
77+
"generate_raw_response",
78+
"a_generate_raw_response",
79+
"generate_samples",
80+
]:
81+
method = getattr(LiteLLMModel, method_name)
82+
method.retry.stop = stop_after_attempt( # pylint: disable=no-member
83+
max_retries
84+
)
85+
86+
logger.info(
87+
"Patched DeepEval retry logic: max_retries=%d",
88+
max_retries,
89+
)
90+
5891
def setup_ssl_verify(self) -> None:
5992
"""Setup SSL verification based on LLM parameters.
6093

src/lightspeed_evaluation/core/metrics/deepeval.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,17 +87,34 @@ def _build_conversational_test_case(self, conv_data: Any) -> ConversationalTestC
8787

8888
return ConversationalTestCase(turns=turns)
8989

90-
def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float, str]:
90+
def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float | None, str]:
9191
"""Evaluate and get result."""
9292
metric.measure(test_case)
9393
self.llm_manager.flush_deepevals_pending_tasks()
9494

95+
score = metric.score
9596
reason = (
9697
metric.reason
9798
if hasattr(metric, "reason") and metric.reason
98-
else f"Score: {metric.score:.2f}"
99+
else f"Score: {score:.2f}" if score is not None else "No score returned"
99100
)
100-
return metric.score, reason
101+
102+
# CRITICAL: Warn if score is None (indicates evaluation failure)
103+
# None scores indicate evaluation failures that need investigation:
104+
# - Rate limiting (429 errors)
105+
# - LLM judge returning malformed JSON that fails parsing
106+
# - Timeout errors from LLM provider
107+
# - API quota/credits exhausted
108+
if score is None:
109+
logger.warning(
110+
"%s metric returned None score. "
111+
"This typically indicates LLM judge failure (rate limiting, timeout, "
112+
"invalid JSON response, or quota exhausted). Reason: %s",
113+
metric.__class__.__name__,
114+
reason,
115+
)
116+
117+
return score, reason
101118

102119
def evaluate(
103120
self,
@@ -117,7 +134,8 @@ def evaluate(
117134
scope: EvaluationScope containing turn info and conversation flag
118135
119136
Returns:
120-
Tuple of (score, reason)
137+
tuple[float | None, str]: Tuple of (score, reason).
138+
Score is in [0, 1] or None if evaluation failed.
121139
"""
122140
# Route to standard DeepEval metrics
123141
if metric_name in self.supported_metrics:

src/lightspeed_evaluation/core/metrics/geval.py

Lines changed: 16 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,6 @@
2121
from deepeval.metrics.g_eval import Rubric
2222
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
2323
from pydantic import ValidationError
24-
from tenacity import (
25-
retry,
26-
retry_if_exception,
27-
stop_after_attempt,
28-
wait_exponential,
29-
before_sleep_log,
30-
)
3124

3225
from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
3326
from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager
@@ -63,11 +56,6 @@ def __init__(
6356
self.deepeval_llm_manager = deepeval_llm_manager
6457
self.metric_manager = metric_manager
6558

66-
# Get num_retries from LLM configuration (default: 6 to match DeepEval's hardcoded value)
67-
# Note: DeepEval's internal retry logic uses hardcoded MAX_RETRIES=6,
68-
# but we add our own retry layer to respect user configuration
69-
self.num_retries = self.deepeval_llm_manager.llm_params.get("num_retries", 6)
70-
7159
def evaluate( # pylint: disable=R0913,R0917
7260
self,
7361
metric_name: str,
@@ -213,67 +201,6 @@ def _convert_evaluation_params(
213201
# Return the successfully converted list, or None if it ended up empty
214202
return converted if converted else None
215203

216-
def _is_retryable_exception(self, exception: BaseException) -> bool:
217-
"""Check if exception should trigger a retry.
218-
219-
Retryable conditions:
220-
- Rate limiting (429 errors from LLM provider)
221-
- Timeout errors
222-
- Temporary network failures
223-
- LLM provider temporary errors
224-
225-
Args:
226-
exception: The exception to check
227-
228-
Returns:
229-
True if the exception should trigger a retry, False otherwise
230-
"""
231-
# We retry on all exceptions because DeepEval/LiteLLM internally
232-
# handles specific error types (RateLimitError, Timeout, etc.)
233-
# This matches DeepEval's hardcoded behavior: retryable_exceptions = (Exception,)
234-
return isinstance(exception, Exception)
235-
236-
def _measure_with_retry(
237-
self, metric: GEval, test_case: LLMTestCase, context: str
238-
) -> None:
239-
"""Execute metric.measure() with configurable retry logic.
240-
241-
This method wraps DeepEval's metric.measure() with our own retry layer
242-
to respect user-configured num_retries (DeepEval hardcodes MAX_RETRIES=6).
243-
244-
Args:
245-
metric: GEval metric instance
246-
test_case: LLM test case to evaluate
247-
context: Description for logging (e.g., "turn-level" or "conversation-level")
248-
249-
Raises:
250-
Exception: Re-raises the last exception if all retry attempts fail
251-
"""
252-
retry_decorator = retry(
253-
retry=retry_if_exception(self._is_retryable_exception),
254-
stop=stop_after_attempt(self.num_retries),
255-
wait=wait_exponential(multiplier=1, min=1, max=10),
256-
before_sleep=before_sleep_log(logger, logging.WARNING),
257-
reraise=True,
258-
)
259-
260-
@retry_decorator
261-
def _measure() -> None:
262-
metric.measure(test_case)
263-
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
264-
265-
try:
266-
_measure()
267-
except Exception as e:
268-
logger.error(
269-
"GEval %s evaluation failed after %d retry attempts: %s: %s",
270-
context,
271-
self.num_retries,
272-
type(e).__name__,
273-
str(e),
274-
)
275-
raise
276-
277204
def _evaluate_turn( # pylint: disable=R0913,R0917
278205
self,
279206
turn_data: Any,
@@ -367,7 +294,8 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
367294

368295
# Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
369296
try:
370-
self._measure_with_retry(metric, test_case, "turn-level")
297+
metric.measure(test_case)
298+
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
371299

372300
# Extract score and reason
373301
score = metric.score
@@ -378,24 +306,25 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
378306
)
379307

380308
# CRITICAL: Warn if score is None (indicates evaluation failure)
381-
# Without this warning, silent conversion to 0.0 masks bugs like:
309+
# None scores indicate evaluation failures that need investigation:
382310
# - Rate limiting (429 errors after all retries exhausted)
383311
# - LLM judge returning malformed JSON that fails parsing
384312
# - Timeout errors from LLM provider
385313
# - API quota/credits exhausted
386-
# This makes debugging nearly impossible as failed evaluations
387-
# appear as low scores (0.0) instead of errors.
314+
# Warning helps identify these failures for debugging.
388315
if score is None:
389316
logger.warning(
390-
"GEval turn-level metric returned None score; defaulting to 0.0. "
317+
"GEval turn-level metric returned None score. "
391318
"This typically indicates LLM judge failure (rate limiting, timeout, "
392319
"invalid JSON response, or quota exhausted). Reason: %s",
393320
reason,
394321
)
395-
score = 0.0
396322

397323
return score, reason
398324
except Exception as e: # pylint: disable=W0718
325+
logger.error(
326+
"GEval turn-level evaluation failed: %s: %s", type(e).__name__, str(e)
327+
)
399328
logger.debug(
400329
"Test case input: %s...",
401330
test_case.input[:100] if test_case.input else "None",
@@ -495,7 +424,8 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
495424

496425
# Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
497426
try:
498-
self._measure_with_retry(metric, test_case, "conversation-level")
427+
metric.measure(test_case)
428+
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
499429

500430
# Extract score and reason
501431
score = metric.score
@@ -509,15 +439,19 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
509439
# See turn-level evaluation for detailed explanation of why this matters
510440
if score is None:
511441
logger.warning(
512-
"GEval conversation-level metric returned None score; defaulting to 0.0. "
442+
"GEval conversation-level metric returned None score. "
513443
"This typically indicates LLM judge failure (rate limiting, timeout, "
514444
"invalid JSON response, or quota exhausted). Reason: %s",
515445
reason,
516446
)
517-
score = 0.0
518447

519448
return score, reason
520449
except Exception as e: # pylint: disable=W0718
450+
logger.error(
451+
"GEval conversation-level evaluation failed: %s: %s",
452+
type(e).__name__,
453+
str(e),
454+
)
521455
logger.debug("Conversation turns: %d", len(conv_data.turns))
522456
logger.debug(
523457
"Test case input preview: %s...",

0 commit comments

Comments
 (0)