Skip to content

Commit b7d9bfd

Browse files
committed
fix: Retry logic for Deepevals score=None bug
1 parent fda6b29 commit b7d9bfd

9 files changed

Lines changed: 449 additions & 154 deletions

File tree

bug_test_geval_score_mismatch.yaml

Lines changed: 0 additions & 54 deletions
This file was deleted.

config/system.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ core:
2525
llm_pool:
2626
defaults:
2727
timeout: 300
28-
num_retries: 3 # Retry attempts for LLM judge (applies to GEval, custom metrics, etc.; default: 6 if not specified)
28+
num_retries: 3
2929
parameters:
3030
temperature: 0.0
3131
max_completion_tokens: 1024

src/lightspeed_evaluation/core/llm/deepeval.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010

1111
import litellm
1212
from deepeval.models import LiteLLMModel
13+
from tenacity import stop_after_attempt
1314

15+
from lightspeed_evaluation.core.constants import DEFAULT_LLM_RETRIES
1416
from lightspeed_evaluation.core.llm.litellm_patch import setup_litellm_ssl
1517

1618
logger = logging.getLogger(__name__)
@@ -46,15 +48,46 @@ def __init__(self, model_name: str, llm_params: dict[str, Any]):
4648
# LiteLLMModel stores **kwargs in self.kwargs and merges them into
4749
# every litellm.completion() call
4850
# Note: Forbidden keys are rejected at LLMParametersConfig load time
51+
52+
# Override DeepEval's hardcoded retry logic with user configuration
53+
# DeepEval uses @retry decorators that capture MAX_RETRIES at import time
54+
# We must patch the retry decorators after import but before instantiation
55+
num_retries = self.llm_params.get("num_retries", DEFAULT_LLM_RETRIES)
56+
57+
self._patch_deepeval_retries(num_retries)
58+
4959
self.llm_model = LiteLLMModel(
5060
model=self.model_name,
5161
timeout=self.llm_params.get("timeout"),
52-
num_retries=self.llm_params.get("num_retries"),
5362
**self.llm_params.get("parameters", {}),
5463
)
5564

5665
print(f"✅ DeepEval LLM Manager: {self.model_name}")
5766

67+
def _patch_deepeval_retries(self, max_retries: int) -> None:
68+
"""Monkey-patch DeepEval's retry decorators to use configured max_retries.
69+
70+
DeepEval's @retry decorators capture MAX_RETRIES at import time.
71+
We patch the 'stop' attribute on each retry decorator to use our value.
72+
"""
73+
# Patch the stop condition on all retry-decorated methods
74+
for method_name in [
75+
"generate",
76+
"a_generate",
77+
"generate_raw_response",
78+
"a_generate_raw_response",
79+
"generate_samples",
80+
]:
81+
method = getattr(LiteLLMModel, method_name)
82+
method.retry.stop = stop_after_attempt( # pylint: disable=no-member
83+
max_retries
84+
)
85+
86+
logger.info(
87+
"Patched DeepEval retry logic: max_retries=%d",
88+
max_retries,
89+
)
90+
5891
def setup_ssl_verify(self) -> None:
5992
"""Setup SSL verification based on LLM parameters.
6093

src/lightspeed_evaluation/core/metrics/deepeval.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,17 +87,34 @@ def _build_conversational_test_case(self, conv_data: Any) -> ConversationalTestC
8787

8888
return ConversationalTestCase(turns=turns)
8989

90-
def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float, str]:
90+
def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float | None, str]:
9191
"""Evaluate and get result."""
9292
metric.measure(test_case)
9393
self.llm_manager.flush_deepevals_pending_tasks()
9494

95+
score = metric.score
9596
reason = (
9697
metric.reason
9798
if hasattr(metric, "reason") and metric.reason
98-
else f"Score: {metric.score:.2f}"
99+
else f"Score: {score:.2f}" if score is not None else "No score returned"
99100
)
100-
return metric.score, reason
101+
102+
# CRITICAL: Warn if score is None (indicates evaluation failure)
103+
# None scores indicate evaluation failures that need investigation:
104+
# - Rate limiting (429 errors)
105+
# - LLM judge returning malformed JSON that fails parsing
106+
# - Timeout errors from LLM provider
107+
# - API quota/credits exhausted
108+
if score is None:
109+
logger.warning(
110+
"%s metric returned None score. "
111+
"This typically indicates LLM judge failure (rate limiting, timeout, "
112+
"invalid JSON response, or quota exhausted). Reason: %s",
113+
metric.__class__.__name__,
114+
reason,
115+
)
116+
117+
return score, reason
101118

102119
def evaluate(
103120
self,
@@ -117,7 +134,8 @@ def evaluate(
117134
scope: EvaluationScope containing turn info and conversation flag
118135
119136
Returns:
120-
Tuple of (score, reason)
137+
tuple[float | None, str]: Tuple of (score, reason).
138+
Score is in [0, 1] or None if evaluation failed.
121139
"""
122140
# Route to standard DeepEval metrics
123141
if metric_name in self.supported_metrics:

src/lightspeed_evaluation/core/metrics/geval.py

Lines changed: 8 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,6 @@
2222
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
2323

2424
from pydantic import ValidationError
25-
from tenacity import (
26-
retry,
27-
retry_if_exception,
28-
stop_after_attempt,
29-
wait_exponential,
30-
before_sleep_log,
31-
)
3225

3326
from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
3427
from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager
@@ -64,11 +57,6 @@ def __init__(
6457
self.deepeval_llm_manager = deepeval_llm_manager
6558
self.metric_manager = metric_manager
6659

67-
# Get num_retries from LLM configuration (default: 6 to match DeepEval's hardcoded value)
68-
# Note: DeepEval's internal retry logic uses hardcoded MAX_RETRIES=6,
69-
# but we add our own retry layer to respect user configuration
70-
self.num_retries = self.deepeval_llm_manager.llm_params.get("num_retries", 6)
71-
7260
def evaluate( # pylint: disable=R0913,R0917
7361
self,
7462
metric_name: str,
@@ -214,67 +202,6 @@ def _convert_evaluation_params(
214202
# Return the successfully converted list, or None if it ended up empty
215203
return converted if converted else None
216204

217-
def _is_retryable_exception(self, exception: BaseException) -> bool:
218-
"""Check if exception should trigger a retry.
219-
220-
Retryable conditions:
221-
- Rate limiting (429 errors from LLM provider)
222-
- Timeout errors
223-
- Temporary network failures
224-
- LLM provider temporary errors
225-
226-
Args:
227-
exception: The exception to check
228-
229-
Returns:
230-
True if the exception should trigger a retry, False otherwise
231-
"""
232-
# We retry on all exceptions because DeepEval/LiteLLM internally
233-
# handles specific error types (RateLimitError, Timeout, etc.)
234-
# This matches DeepEval's hardcoded behavior: retryable_exceptions = (Exception,)
235-
return isinstance(exception, Exception)
236-
237-
def _measure_with_retry(
238-
self, metric: GEval, test_case: LLMTestCase, context: str
239-
) -> None:
240-
"""Execute metric.measure() with configurable retry logic.
241-
242-
This method wraps DeepEval's metric.measure() with our own retry layer
243-
to respect user-configured num_retries (DeepEval hardcodes MAX_RETRIES=6).
244-
245-
Args:
246-
metric: GEval metric instance
247-
test_case: LLM test case to evaluate
248-
context: Description for logging (e.g., "turn-level" or "conversation-level")
249-
250-
Raises:
251-
Exception: Re-raises the last exception if all retry attempts fail
252-
"""
253-
retry_decorator = retry(
254-
retry=retry_if_exception(self._is_retryable_exception),
255-
stop=stop_after_attempt(self.num_retries),
256-
wait=wait_exponential(multiplier=1, min=1, max=10),
257-
before_sleep=before_sleep_log(logger, logging.WARNING),
258-
reraise=True,
259-
)
260-
261-
@retry_decorator
262-
def _measure() -> None:
263-
metric.measure(test_case)
264-
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
265-
266-
try:
267-
_measure()
268-
except Exception as e:
269-
logger.error(
270-
"GEval %s evaluation failed after %d retry attempts: %s: %s",
271-
context,
272-
self.num_retries,
273-
type(e).__name__,
274-
str(e),
275-
)
276-
raise
277-
278205
def _evaluate_turn( # pylint: disable=R0913,R0917
279206
self,
280207
turn_data: Any,
@@ -368,7 +295,8 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
368295

369296
# Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
370297
try:
371-
self._measure_with_retry(metric, test_case, "turn-level")
298+
metric.measure(test_case)
299+
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
372300

373301
# Extract score and reason
374302
score = metric.score
@@ -379,21 +307,19 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
379307
)
380308

381309
# CRITICAL: Warn if score is None (indicates evaluation failure)
382-
# Without this warning, silent conversion to 0.0 masks bugs like:
310+
# None scores indicate evaluation failures that need investigation:
383311
# - Rate limiting (429 errors after all retries exhausted)
384312
# - LLM judge returning malformed JSON that fails parsing
385313
# - Timeout errors from LLM provider
386314
# - API quota/credits exhausted
387-
# This makes debugging nearly impossible as failed evaluations
388-
# appear as low scores (0.0) instead of errors.
315+
# Warning helps identify these failures for debugging.
389316
if score is None:
390317
logger.warning(
391-
"GEval turn-level metric returned None score; defaulting to 0.0. "
318+
"GEval turn-level metric returned None score. "
392319
"This typically indicates LLM judge failure (rate limiting, timeout, "
393320
"invalid JSON response, or quota exhausted). Reason: %s",
394321
reason,
395322
)
396-
score = 0.0
397323

398324
return score, reason
399325
except Exception as e: # pylint: disable=W0718
@@ -496,7 +422,8 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
496422

497423
# Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
498424
try:
499-
self._measure_with_retry(metric, test_case, "conversation-level")
425+
metric.measure(test_case)
426+
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
500427

501428
# Extract score and reason
502429
score = metric.score
@@ -510,12 +437,11 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
510437
# See turn-level evaluation for detailed explanation of why this matters
511438
if score is None:
512439
logger.warning(
513-
"GEval conversation-level metric returned None score; defaulting to 0.0. "
440+
"GEval conversation-level metric returned None score. "
514441
"This typically indicates LLM judge failure (rate limiting, timeout, "
515442
"invalid JSON response, or quota exhausted). Reason: %s",
516443
reason,
517444
)
518-
score = 0.0
519445

520446
return score, reason
521447
except Exception as e: # pylint: disable=W0718

tests/unit/core/llm/test_deepeval_manager.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Unit tests for DeepEval LLM Manager."""
22

3+
import logging
4+
35
import pytest
46
from pytest_mock import MockerFixture
57

@@ -95,3 +97,62 @@ def test_drop_params_always_enabled(self, mocker: MockerFixture) -> None:
9597
DeepEvalLLMManager("gpt-4", {})
9698

9799
assert mock_litellm.drop_params is True
100+
101+
def test_patch_deepeval_retries_called_with_configured_value(
102+
self, mocker: MockerFixture
103+
) -> None:
104+
"""Test that _patch_deepeval_retries patches LiteLLMModel retry logic."""
105+
# Mock LiteLLMModel with methods that have retry decorators
106+
mock_generate = mocker.Mock()
107+
mock_generate.retry = mocker.Mock()
108+
mock_a_generate = mocker.Mock()
109+
mock_a_generate.retry = mocker.Mock()
110+
mock_generate_raw = mocker.Mock()
111+
mock_generate_raw.retry = mocker.Mock()
112+
mock_a_generate_raw = mocker.Mock()
113+
mock_a_generate_raw.retry = mocker.Mock()
114+
mock_generate_samples = mocker.Mock()
115+
mock_generate_samples.retry = mocker.Mock()
116+
117+
mock_litellm_class = mocker.patch(
118+
"lightspeed_evaluation.core.llm.deepeval.LiteLLMModel"
119+
)
120+
mock_litellm_class.generate = mock_generate
121+
mock_litellm_class.a_generate = mock_a_generate
122+
mock_litellm_class.generate_raw_response = mock_generate_raw
123+
mock_litellm_class.a_generate_raw_response = mock_a_generate_raw
124+
mock_litellm_class.generate_samples = mock_generate_samples
125+
126+
# Mock stop_after_attempt to return a mock stop condition
127+
mock_stop_condition = mocker.Mock()
128+
mock_stop_after_attempt = mocker.patch(
129+
"lightspeed_evaluation.core.llm.deepeval.stop_after_attempt",
130+
return_value=mock_stop_condition,
131+
)
132+
133+
params = {"num_retries": 5}
134+
DeepEvalLLMManager("gpt-4", params)
135+
136+
# Verify stop_after_attempt was called 5 times (once per method) with the configured retries
137+
assert mock_stop_after_attempt.call_count == 5
138+
mock_stop_after_attempt.assert_called_with(5)
139+
140+
# Verify each method's retry.stop was set to the new stop condition
141+
assert mock_generate.retry.stop == mock_stop_condition
142+
assert mock_a_generate.retry.stop == mock_stop_condition
143+
assert mock_generate_raw.retry.stop == mock_stop_condition
144+
assert mock_a_generate_raw.retry.stop == mock_stop_condition
145+
assert mock_generate_samples.retry.stop == mock_stop_condition
146+
147+
def test_patch_deepeval_retries_logs_operation(
148+
self, mocker: MockerFixture, caplog: pytest.LogCaptureFixture
149+
) -> None:
150+
"""Test that retry patching logs the max_retries value."""
151+
mocker.patch("lightspeed_evaluation.core.llm.deepeval.LiteLLMModel")
152+
153+
params = {"num_retries": 7}
154+
155+
with caplog.at_level(logging.INFO):
156+
DeepEvalLLMManager("gpt-4", params)
157+
158+
assert "Patched DeepEval retry logic: max_retries=7" in caplog.text

0 commit comments

Comments
 (0)