Skip to content

Commit ffdf709

Browse files
authored
Merge pull request lightspeed-core#188 from xmican10/LEADS-253-deepeval-cache-miss
[LEADS-253] Deepevals cache miss fixed
2 parents 59749df + 282289d commit ffdf709

3 files changed

Lines changed: 17 additions & 0 deletions

File tree

src/lightspeed_evaluation/core/llm/deepeval.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,16 @@
44
This ensures DeepEval's LiteLLMModel uses the patched completion functions.
55
"""
66

7+
import asyncio
8+
import logging
79
import os
810
from typing import Any
911

1012
import litellm
1113
from deepeval.models import LiteLLMModel
1214

15+
logger = logging.getLogger(__name__)
16+
1317

1418
class DeepEvalLLMManager:
1519
"""DeepEval LLM Manager - Takes LLM parameters directly.
@@ -68,3 +72,11 @@ def get_model_info(self) -> dict[str, Any]:
6872
"timeout": self.llm_params.get("timeout"),
6973
"num_retries": self.llm_params.get("num_retries", 3),
7074
}
75+
76+
@staticmethod
def flush_deepevals_pending_tasks() -> None:
    """Flush background tasks left pending by DeepEval's async_mode.

    Drives a zero-delay ``asyncio.sleep`` to completion with
    ``asyncio.run``. The call is best-effort: a ``RuntimeError`` (which
    ``asyncio.run`` raises when an event loop is already running in the
    current thread) is logged at debug level and otherwise ignored, so
    this method never propagates that error to the caller.
    """
    # Build the coroutine up front so it can be closed if asyncio.run()
    # rejects it. Passing ``asyncio.sleep(0)`` inline would leave a
    # never-started coroutine behind on failure, which emits a
    # "coroutine 'sleep' was never awaited" RuntimeWarning when collected.
    noop = asyncio.sleep(0)
    try:
        asyncio.run(noop)
    except RuntimeError as e:
        # close() is a no-op on a finished coroutine and silences the
        # never-awaited warning on one that was never started.
        noop.close()
        logger.debug("Could not flush DeepEval pending tasks: %s", e)

src/lightspeed_evaluation/core/metrics/deepeval.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def _build_conversational_test_case(self, conv_data: Any) -> ConversationalTestC
8686
def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float, str]:
8787
"""Evaluate and get result."""
8888
metric.measure(test_case)
89+
self.llm_manager.flush_deepevals_pending_tasks()
8990

9091
reason = (
9192
metric.reason

src/lightspeed_evaluation/core/metrics/geval.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,8 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
296296
# Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
297297
try:
298298
metric.measure(test_case)
299+
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
300+
299301
score = metric.score if metric.score is not None else 0.0
300302
reason = (
301303
str(metric.reason)
@@ -403,6 +405,8 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
403405
# Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
404406
try:
405407
metric.measure(test_case)
408+
self.deepeval_llm_manager.flush_deepevals_pending_tasks()
409+
406410
score = metric.score if metric.score is not None else 0.0
407411
reason = (
408412
str(metric.reason)

0 commit comments

Comments
 (0)