Skip to content

Commit b37303f

Browse files
committed
fix: Retry logic for Deepevals score=None bug
1 parent fda6b29 commit b37303f

8 files changed

Lines changed: 434 additions & 135 deletions

File tree

bug_test_geval_score_mismatch.yaml

Lines changed: 0 additions & 54 deletions
This file was deleted.

config/system.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ core:
2525
llm_pool:
2626
defaults:
2727
timeout: 300
28-
num_retries: 3 # Retry attempts for LLM judge (applies to GEval, custom metrics, etc.; default: 6 if not specified)
28+
num_retries: 3
2929
parameters:
3030
temperature: 0.0
3131
max_completion_tokens: 1024

src/lightspeed_evaluation/core/llm/deepeval.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010

1111
import litellm
1212
from deepeval.models import LiteLLMModel
13+
from tenacity import stop_after_attempt
1314

15+
from lightspeed_evaluation.core.constants import DEFAULT_LLM_RETRIES
1416
from lightspeed_evaluation.core.llm.litellm_patch import setup_litellm_ssl
1517

1618
logger = logging.getLogger(__name__)
@@ -46,15 +48,46 @@ def __init__(self, model_name: str, llm_params: dict[str, Any]):
4648
# LiteLLMModel stores **kwargs in self.kwargs and merges them into
4749
# every litellm.completion() call
4850
# Note: Forbidden keys are rejected at LLMParametersConfig load time
51+
52+
# Override DeepEval's hardcoded retry logic with user configuration
53+
# DeepEval uses @retry decorators that capture MAX_RETRIES at import time
54+
# We must patch the retry decorators after import but before instantiation
55+
num_retries = self.llm_params.get("num_retries", DEFAULT_LLM_RETRIES)
56+
57+
self._patch_deepeval_retries(num_retries)
58+
4959
self.llm_model = LiteLLMModel(
5060
model=self.model_name,
5161
timeout=self.llm_params.get("timeout"),
52-
num_retries=self.llm_params.get("num_retries"),
5362
**self.llm_params.get("parameters", {}),
5463
)
5564

5665
print(f"✅ DeepEval LLM Manager: {self.model_name}")
5766

67+
def _patch_deepeval_retries(self, max_retries: int) -> None:
68+
"""Monkey-patch DeepEval's retry decorators to use configured max_retries.
69+
70+
DeepEval's @retry decorators capture MAX_RETRIES at import time.
71+
We patch the 'stop' attribute on each retry decorator to use our value.
72+
"""
73+
# Patch the stop condition on all retry-decorated methods
74+
for method_name in [
75+
"generate",
76+
"a_generate",
77+
"generate_raw_response",
78+
"a_generate_raw_response",
79+
"generate_samples",
80+
]:
81+
method = getattr(LiteLLMModel, method_name)
82+
method.retry.stop = stop_after_attempt( # pylint: disable=no-member
83+
max_retries
84+
)
85+
86+
logger.info(
87+
"Patched DeepEval retry logic: max_retries=%d",
88+
max_retries,
89+
)
90+
5891
def setup_ssl_verify(self) -> None:
5992
"""Setup SSL verification based on LLM parameters.
6093

src/lightspeed_evaluation/core/metrics/deepeval.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,30 @@ def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float, str]:
9292
metric.measure(test_case)
9393
self.llm_manager.flush_deepevals_pending_tasks()
9494

95+
score = metric.score
9596
reason = (
9697
metric.reason
9798
if hasattr(metric, "reason") and metric.reason
98-
else f"Score: {metric.score:.2f}"
99+
else f"Score: {score:.2f}" if score is not None else "No score returned"
99100
)
100-
return metric.score, reason
101+
102+
# CRITICAL: Warn if score is None (indicates evaluation failure)
103+
# Without this warning, silent conversion to 0.0 masks bugs like:
104+
# - Rate limiting (429 errors)
105+
# - LLM judge returning malformed JSON that fails parsing
106+
# - Timeout errors from LLM provider
107+
# - API quota/credits exhausted
108+
if score is None:
109+
logger.warning(
110+
"%s metric returned None score; defaulting to 0.0. "
111+
"This typically indicates LLM judge failure (rate limiting, timeout, "
112+
"invalid JSON response, or quota exhausted). Reason: %s",
113+
metric.__class__.__name__,
114+
reason,
115+
)
116+
score = 0.0
117+
118+
return score, reason
101119

102120
def evaluate(
103121
self,

src/lightspeed_evaluation/core/metrics/geval.py

Lines changed: 14 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,6 @@
2222
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
2323

2424
from pydantic import ValidationError
25-
from tenacity import (
26-
retry,
27-
retry_if_exception,
28-
stop_after_attempt,
29-
wait_exponential,
30-
before_sleep_log,
31-
)
3225

3326
from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
3427
from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager
@@ -64,11 +57,6 @@ def __init__(
6457
self.deepeval_llm_manager = deepeval_llm_manager
6558
self.metric_manager = metric_manager
6659

67-
# Get num_retries from LLM configuration (default: 6 to match DeepEval's hardcoded value)
68-
# Note: DeepEval's internal retry logic uses hardcoded MAX_RETRIES=6,
69-
# but we add our own retry layer to respect user configuration
70-
self.num_retries = self.deepeval_llm_manager.llm_params.get("num_retries", 6)
71-
7260
def evaluate( # pylint: disable=R0913,R0917
7361
self,
7462
metric_name: str,
@@ -214,67 +202,6 @@ def _convert_evaluation_params(
214202
# Return the successfully converted list, or None if it ended up empty
215203
return converted if converted else None
216204

217-
def _is_retryable_exception(self, exception: BaseException) -> bool:
218-
"""Check if exception should trigger a retry.
219-
220-
Retryable conditions:
221-
- Rate limiting (429 errors from LLM provider)
222-
- Timeout errors
223-
- Temporary network failures
224-
- LLM provider temporary errors
225-
226-
Args:
227-
exception: The exception to check
228-
229-
Returns:
230-
True if the exception should trigger a retry, False otherwise
231-
"""
232-
# We retry on all exceptions because DeepEval/LiteLLM internally
233-
# handles specific error types (RateLimitError, Timeout, etc.)
234-
# This matches DeepEval's hardcoded behavior: retryable_exceptions = (Exception,)
235-
return isinstance(exception, Exception)
236-
237-
def _measure_with_retry(
238-
self, metric: GEval, test_case: LLMTestCase, context: str
239-
) -> None:
240-
"""Execute metric.measure() with configurable retry logic.
241-
242-
This method wraps DeepEval's metric.measure() with our own retry layer
243-
to respect user-configured num_retries (DeepEval hardcodes MAX_RETRIES=6).
244-
245-
Args:
246-
metric: GEval metric instance
247-
test_case: LLM test case to evaluate
248-
context: Description for logging (e.g., "turn-level" or "conversation-level")
249-
250-
Raises:
251-
Exception: Re-raises the last exception if all retry attempts fail
252-
"""
253-
retry_decorator = retry(
254-
retry=retry_if_exception(self._is_retryable_exception),
255-
stop=stop_after_attempt(self.num_retries),
256-
wait=wait_exponential(multiplier=1, min=1, max=10),
257-
before_sleep=before_sleep_log(logger, logging.WARNING),
258-
reraise=True,
259-
)
260-
261-
@retry_decorator
262-
def _measure() -> None:
263-
metric.measure(test_case)
264-
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
265-
266-
try:
267-
_measure()
268-
except Exception as e:
269-
logger.error(
270-
"GEval %s evaluation failed after %d retry attempts: %s: %s",
271-
context,
272-
self.num_retries,
273-
type(e).__name__,
274-
str(e),
275-
)
276-
raise
277-
278205
def _evaluate_turn( # pylint: disable=R0913,R0917
279206
self,
280207
turn_data: Any,
@@ -366,9 +293,10 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
366293
# Create test case for a single turn
367294
test_case = LLMTestCase(**test_case_kwargs)
368295

369-
# Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
296+
# Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
370297
try:
371-
self._measure_with_retry(metric, test_case, "turn-level")
298+
metric.measure(test_case)
299+
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
372300

373301
# Extract score and reason
374302
score = metric.score
@@ -397,6 +325,9 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
397325

398326
return score, reason
399327
except Exception as e: # pylint: disable=W0718
328+
logger.error(
329+
"GEval turn-level evaluation failed: %s: %s", type(e).__name__, str(e)
330+
)
400331
logger.debug(
401332
"Test case input: %s...",
402333
test_case.input[:100] if test_case.input else "None",
@@ -494,9 +425,10 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
494425
actual_output="\n".join(conversation_output),
495426
)
496427

497-
# Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
428+
# Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
498429
try:
499-
self._measure_with_retry(metric, test_case, "conversation-level")
430+
metric.measure(test_case)
431+
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
500432

501433
# Extract score and reason
502434
score = metric.score
@@ -519,6 +451,11 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
519451

520452
return score, reason
521453
except Exception as e: # pylint: disable=W0718
454+
logger.error(
455+
"GEval conversation-level evaluation failed: %s: %s",
456+
type(e).__name__,
457+
str(e),
458+
)
522459
logger.debug("Conversation turns: %d", len(conv_data.turns))
523460
logger.debug(
524461
"Test case input preview: %s...",

tests/unit/core/llm/test_deepeval_manager.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Unit tests for DeepEval LLM Manager."""
22

3+
import logging
4+
35
import pytest
46
from pytest_mock import MockerFixture
57

@@ -95,3 +97,62 @@ def test_drop_params_always_enabled(self, mocker: MockerFixture) -> None:
9597
DeepEvalLLMManager("gpt-4", {})
9698

9799
assert mock_litellm.drop_params is True
100+
101+
def test_patch_deepeval_retries_called_with_configured_value(
102+
self, mocker: MockerFixture
103+
) -> None:
104+
"""Test that _patch_deepeval_retries patches LiteLLMModel retry logic."""
105+
# Mock LiteLLMModel with methods that have retry decorators
106+
mock_generate = mocker.Mock()
107+
mock_generate.retry = mocker.Mock()
108+
mock_a_generate = mocker.Mock()
109+
mock_a_generate.retry = mocker.Mock()
110+
mock_generate_raw = mocker.Mock()
111+
mock_generate_raw.retry = mocker.Mock()
112+
mock_a_generate_raw = mocker.Mock()
113+
mock_a_generate_raw.retry = mocker.Mock()
114+
mock_generate_samples = mocker.Mock()
115+
mock_generate_samples.retry = mocker.Mock()
116+
117+
mock_litellm_class = mocker.patch(
118+
"lightspeed_evaluation.core.llm.deepeval.LiteLLMModel"
119+
)
120+
mock_litellm_class.generate = mock_generate
121+
mock_litellm_class.a_generate = mock_a_generate
122+
mock_litellm_class.generate_raw_response = mock_generate_raw
123+
mock_litellm_class.a_generate_raw_response = mock_a_generate_raw
124+
mock_litellm_class.generate_samples = mock_generate_samples
125+
126+
# Mock stop_after_attempt to return a mock stop condition
127+
mock_stop_condition = mocker.Mock()
128+
mock_stop_after_attempt = mocker.patch(
129+
"lightspeed_evaluation.core.llm.deepeval.stop_after_attempt",
130+
return_value=mock_stop_condition,
131+
)
132+
133+
params = {"num_retries": 5}
134+
DeepEvalLLMManager("gpt-4", params)
135+
136+
# Verify stop_after_attempt was called 5 times (once per method) with the configured retries
137+
assert mock_stop_after_attempt.call_count == 5
138+
mock_stop_after_attempt.assert_called_with(5)
139+
140+
# Verify each method's retry.stop was set to the new stop condition
141+
assert mock_generate.retry.stop == mock_stop_condition
142+
assert mock_a_generate.retry.stop == mock_stop_condition
143+
assert mock_generate_raw.retry.stop == mock_stop_condition
144+
assert mock_a_generate_raw.retry.stop == mock_stop_condition
145+
assert mock_generate_samples.retry.stop == mock_stop_condition
146+
147+
def test_patch_deepeval_retries_logs_operation(
148+
self, mocker: MockerFixture, caplog: pytest.LogCaptureFixture
149+
) -> None:
150+
"""Test that retry patching logs the max_retries value."""
151+
mocker.patch("lightspeed_evaluation.core.llm.deepeval.LiteLLMModel")
152+
153+
params = {"num_retries": 7}
154+
155+
with caplog.at_level(logging.INFO):
156+
DeepEvalLLMManager("gpt-4", params)
157+
158+
assert "Patched DeepEval retry logic: max_retries=7" in caplog.text

0 commit comments

Comments
 (0)