DeepEval cache miss fixed

xmican10 · xmican10 · commit 282289da5383 · 2026-03-13T14:08:33.000+01:00
diff --git a/src/lightspeed_evaluation/core/llm/deepeval.py b/src/lightspeed_evaluation/core/llm/deepeval.py
@@ -4,12 +4,16 @@
 This ensures DeepEval's LiteLLMModel uses the patched completion functions.
 """
 
+import asyncio
+import logging
 import os
 from typing import Any
 
 import litellm
 from deepeval.models import LiteLLMModel
 
+logger = logging.getLogger(__name__)
+
 
 class DeepEvalLLMManager:
     """DeepEval LLM Manager - Takes LLM parameters directly.
@@ -68,3 +72,11 @@ def get_model_info(self) -> dict[str, Any]:
             "timeout": self.llm_params.get("timeout"),
             "num_retries": self.llm_params.get("num_retries", 3),
         }
+
+    @staticmethod
+    def flush_deepevals_pending_tasks() -> None:
+        """Flush background tasks left pending by DeepEval's async_mode."""
+        try:
+            asyncio.run(asyncio.sleep(0))  # sleep(0) yields for one loop iteration
+        except RuntimeError as e:  # asyncio.run() raises this inside a running loop
+            logger.debug("Could not flush DeepEval pending tasks: %s", e)
diff --git a/src/lightspeed_evaluation/core/metrics/deepeval.py b/src/lightspeed_evaluation/core/metrics/deepeval.py
@@ -86,6 +86,7 @@ def _build_conversational_test_case(self, conv_data: Any) -> ConversationalTestC
     def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float, str]:
         """Evaluate and get result."""
         metric.measure(test_case)
+        self.llm_manager.flush_deepevals_pending_tasks()
 
         reason = (
             metric.reason
diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py
@@ -295,6 +295,8 @@ def _evaluate_turn(  # pylint: disable=R0913,R0917
         # Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
         try:
             metric.measure(test_case)
+            self.deepeval_llm_manager.flush_deepevals_pending_tasks()
+
             score = metric.score if metric.score is not None else 0.0
             reason = (
                 str(metric.reason)
@@ -402,6 +404,8 @@ def _evaluate_conversation(  # pylint: disable=R0913,R0917,R0914
         # Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
         try:
             metric.measure(test_case)
+            self.deepeval_llm_manager.flush_deepevals_pending_tasks()
+
             score = metric.score if metric.score is not None else 0.0
             reason = (
                 str(metric.reason)