lightspeed-core
diff --git a/bug_test_geval_score_mismatch.yaml b/bug_test_geval_score_mismatch.yaml
Lines changed: 0 additions & 54 deletions
diff --git a/config/system.yaml b/config/system.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/src/lightspeed_evaluation/core/llm/deepeval.py b/src/lightspeed_evaluation/core/llm/deepeval.py
Lines changed: 34 additions & 1 deletion
diff --git a/src/lightspeed_evaluation/core/metrics/deepeval.py b/src/lightspeed_evaluation/core/metrics/deepeval.py
Lines changed: 22 additions & 4 deletions
diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py
Lines changed: 16 additions & 82 deletions
@@ -25,7 +25,7 @@ core:
 llm_pool:
   defaults:
     timeout: 300
-    num_retries: 3  # Retry attempts for LLM judge (applies to GEval, custom metrics, etc.; default: 6 if not specified)
+    num_retries: 3
     parameters:
       temperature: 0.0
       max_completion_tokens: 1024
 
@@ -10,7 +10,9 @@
 
 import litellm
 from deepeval.models import LiteLLMModel
+from tenacity import stop_after_attempt
 
+from lightspeed_evaluation.core.constants import DEFAULT_LLM_RETRIES
 from lightspeed_evaluation.core.llm.litellm_patch import setup_litellm_ssl
 
 logger = logging.getLogger(__name__)
@@ -46,15 +48,46 @@ def __init__(self, model_name: str, llm_params: dict[str, Any]):
         # LiteLLMModel stores **kwargs in self.kwargs and merges them into
         # every litellm.completion() call
         # Note: Forbidden keys are rejected at LLMParametersConfig load time
+
+        # Override DeepEval's hardcoded retry logic with user configuration
+        # DeepEval uses @retry decorators that capture MAX_RETRIES at import time
+        # We must patch the retry decorators after import but before instantiation
+        num_retries = self.llm_params.get("num_retries", DEFAULT_LLM_RETRIES)
+
+        self._patch_deepeval_retries(num_retries)
+
         self.llm_model = LiteLLMModel(
             model=self.model_name,
             timeout=self.llm_params.get("timeout"),
-            num_retries=self.llm_params.get("num_retries"),
             **self.llm_params.get("parameters", {}),
         )
 
         print(f"✅ DeepEval LLM Manager: {self.model_name}")
 
+    def _patch_deepeval_retries(self, max_retries: int) -> None:
+        """Monkey-patch DeepEval's retry decorators to use configured max_retries.
+
+        DeepEval's @retry decorators capture MAX_RETRIES at import time.
+        We replace the 'stop' attribute on the retry controller attached to
+        each retry-decorated ``LiteLLMModel`` method with our own value.
+
+        Note:
+            * The patch is applied to the ``LiteLLMModel`` class, not to an
+              instance, so it affects every ``LiteLLMModel`` in the process
+              and the most recent call wins.
+            * ``stop_after_attempt`` counts total attempts (the first call
+              included), so ``max_retries`` is an attempt budget.
+
+        Args:
+            max_retries: Attempt limit passed to ``stop_after_attempt``.
+                A non-integer value (e.g. ``None`` from a null config entry)
+                is rejected with a warning and nothing is patched.
+        """
+        # bool is an int subclass; reject it explicitly so True/False from a
+        # mistyped config is not silently treated as 1/0.
+        if isinstance(max_retries, bool) or not isinstance(max_retries, int):
+            logger.warning(
+                "Invalid num_retries %r; leaving DeepEval retry logic unchanged",
+                max_retries,
+            )
+            return
+
+        stop_condition = stop_after_attempt(max_retries)
+        patched_methods = []
+
+        # Patch the stop condition on all retry-decorated methods
+        for method_name in (
+            "generate",
+            "a_generate",
+            "generate_raw_response",
+            "a_generate_raw_response",
+            "generate_samples",
+        ):
+            method = getattr(LiteLLMModel, method_name, None)
+            retry_controller = getattr(method, "retry", None)
+            if retry_controller is None:
+                # Method missing or no longer tenacity-wrapped in this
+                # DeepEval version: skip it instead of failing construction.
+                logger.warning(
+                    "LiteLLMModel.%s has no retry decorator to patch; skipping",
+                    method_name,
+                )
+                continue
+            retry_controller.stop = stop_condition
+            patched_methods.append(method_name)
+
+        if patched_methods:
+            logger.info(
+                "Patched DeepEval retry logic: max_retries=%d",
+                max_retries,
+            )
+
     def setup_ssl_verify(self) -> None:
         """Setup SSL verification based on LLM parameters.
 
 
@@ -87,17 +87,34 @@ def _build_conversational_test_case(self, conv_data: Any) -> ConversationalTestC
 
         return ConversationalTestCase(turns=turns)
 
-    def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float, str]:
+    def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float | None, str]:
         """Evaluate and get result."""
         metric.measure(test_case)
         self.llm_manager.flush_deepevals_pending_tasks()
 
+        score = metric.score
         reason = (
             metric.reason
             if hasattr(metric, "reason") and metric.reason
-            else f"Score: {metric.score:.2f}"
+            else f"Score: {score:.2f}" if score is not None else "No score returned"
         )
-        return metric.score, reason
+
+        # CRITICAL: Warn if score is None (indicates evaluation failure)
+        # None scores indicate evaluation failures that need investigation:
+        # - Rate limiting (429 errors)
+        # - LLM judge returning malformed JSON that fails parsing
+        # - Timeout errors from LLM provider
+        # - API quota/credits exhausted
+        if score is None:
+            logger.warning(
+                "%s metric returned None score. "
+                "This typically indicates LLM judge failure (rate limiting, timeout, "
+                "invalid JSON response, or quota exhausted). Reason: %s",
+                metric.__class__.__name__,
+                reason,
+            )
+
+        return score, reason
 
     def evaluate(
         self,
@@ -117,7 +134,8 @@ def evaluate(
             scope: EvaluationScope containing turn info and conversation flag
 
         Returns:
-            Tuple of (score, reason)
+            tuple[float | None, str]: Tuple of (score, reason).
+                Score is in [0, 1] or None if evaluation failed.
         """
         # Route to standard DeepEval metrics
         if metric_name in self.supported_metrics:
 
@@ -21,13 +21,6 @@
 from deepeval.metrics.g_eval import Rubric
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams
 from pydantic import ValidationError
-from tenacity import (
-    retry,
-    retry_if_exception,
-    stop_after_attempt,
-    wait_exponential,
-    before_sleep_log,
-)
 
 from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
 from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager
@@ -63,11 +56,6 @@ def __init__(
         self.deepeval_llm_manager = deepeval_llm_manager
         self.metric_manager = metric_manager
 
-        # Get num_retries from LLM configuration (default: 6 to match DeepEval's hardcoded value)
-        # Note: DeepEval's internal retry logic uses hardcoded MAX_RETRIES=6,
-        # but we add our own retry layer to respect user configuration
-        self.num_retries = self.deepeval_llm_manager.llm_params.get("num_retries", 6)
-
     def evaluate(  # pylint: disable=R0913,R0917
         self,
         metric_name: str,
@@ -213,67 +201,6 @@ def _convert_evaluation_params(
         # Return the successfully converted list, or None if it ended up empty
         return converted if converted else None
 
-    def _is_retryable_exception(self, exception: BaseException) -> bool:
-        """Check if exception should trigger a retry.
-
-        Retryable conditions:
-        - Rate limiting (429 errors from LLM provider)
-        - Timeout errors
-        - Temporary network failures
-        - LLM provider temporary errors
-
-        Args:
-            exception: The exception to check
-
-        Returns:
-            True if the exception should trigger a retry, False otherwise
-        """
-        # We retry on all exceptions because DeepEval/LiteLLM internally
-        # handles specific error types (RateLimitError, Timeout, etc.)
-        # This matches DeepEval's hardcoded behavior: retryable_exceptions = (Exception,)
-        return isinstance(exception, Exception)
-
-    def _measure_with_retry(
-        self, metric: GEval, test_case: LLMTestCase, context: str
-    ) -> None:
-        """Execute metric.measure() with configurable retry logic.
-
-        This method wraps DeepEval's metric.measure() with our own retry layer
-        to respect user-configured num_retries (DeepEval hardcodes MAX_RETRIES=6).
-
-        Args:
-            metric: GEval metric instance
-            test_case: LLM test case to evaluate
-            context: Description for logging (e.g., "turn-level" or "conversation-level")
-
-        Raises:
-            Exception: Re-raises the last exception if all retry attempts fail
-        """
-        retry_decorator = retry(
-            retry=retry_if_exception(self._is_retryable_exception),
-            stop=stop_after_attempt(self.num_retries),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            before_sleep=before_sleep_log(logger, logging.WARNING),
-            reraise=True,
-        )
-
-        @retry_decorator
-        def _measure() -> None:
-            metric.measure(test_case)
-            self.deepeval_llm_manager.flush_deepevals_pending_tasks()
-
-        try:
-            _measure()
-        except Exception as e:
-            logger.error(
-                "GEval %s evaluation failed after %d retry attempts: %s: %s",
-                context,
-                self.num_retries,
-                type(e).__name__,
-                str(e),
-            )
-            raise
-
     def _evaluate_turn(  # pylint: disable=R0913,R0917
         self,
         turn_data: Any,
@@ -367,7 +294,8 @@ def _evaluate_turn(  # pylint: disable=R0913,R0917
 
         # Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
         try:
-            self._measure_with_retry(metric, test_case, "turn-level")
+            metric.measure(test_case)
+            self.deepeval_llm_manager.flush_deepevals_pending_tasks()
 
             # Extract score and reason
             score = metric.score
@@ -378,24 +306,25 @@ def _evaluate_turn(  # pylint: disable=R0913,R0917
             )
 
             # CRITICAL: Warn if score is None (indicates evaluation failure)
-            # Without this warning, silent conversion to 0.0 masks bugs like:
+            # None scores indicate evaluation failures that need investigation:
             # - Rate limiting (429 errors after all retries exhausted)
             # - LLM judge returning malformed JSON that fails parsing
             # - Timeout errors from LLM provider
             # - API quota/credits exhausted
-            # This makes debugging nearly impossible as failed evaluations
-            # appear as low scores (0.0) instead of errors.
+            # Warning helps identify these failures for debugging.
             if score is None:
                 logger.warning(
-                    "GEval turn-level metric returned None score; defaulting to 0.0. "
+                    "GEval turn-level metric returned None score. "
                     "This typically indicates LLM judge failure (rate limiting, timeout, "
                     "invalid JSON response, or quota exhausted). Reason: %s",
                     reason,
                 )
-                score = 0.0
 
             return score, reason
         except Exception as e:  # pylint: disable=W0718
+            logger.error(
+                "GEval turn-level evaluation failed: %s: %s", type(e).__name__, str(e)
+            )
             logger.debug(
                 "Test case input: %s...",
                 test_case.input[:100] if test_case.input else "None",
@@ -495,7 +424,8 @@ def _evaluate_conversation(  # pylint: disable=R0913,R0917,R0914
 
         # Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
         try:
-            self._measure_with_retry(metric, test_case, "conversation-level")
+            metric.measure(test_case)
+            self.deepeval_llm_manager.flush_deepevals_pending_tasks()
 
             # Extract score and reason
             score = metric.score
@@ -509,15 +439,19 @@ def _evaluate_conversation(  # pylint: disable=R0913,R0917,R0914
             # See turn-level evaluation for detailed explanation of why this matters
             if score is None:
                 logger.warning(
-                    "GEval conversation-level metric returned None score; defaulting to 0.0. "
+                    "GEval conversation-level metric returned None score. "
                     "This typically indicates LLM judge failure (rate limiting, timeout, "
                     "invalid JSON response, or quota exhausted). Reason: %s",
                     reason,
                 )
-                score = 0.0
 
             return score, reason
         except Exception as e:  # pylint: disable=W0718
+            logger.error(
+                "GEval conversation-level evaluation failed: %s: %s",
+                type(e).__name__,
+                str(e),
+            )
             logger.debug("Conversation turns: %d", len(conv_data.turns))
             logger.debug(
                 "Test case input preview: %s...",