feat: Add ability to evaluate ragas metrics asynchronously

maxdswain · maxdswain · commit 9559e31e6a17 · 2026-04-27T17:04:10.000+01:00
diff --git a/integrations/ragas/pyproject.toml b/integrations/ragas/pyproject.toml
@@ -82,7 +82,7 @@ ignore_missing_imports = true
 
 [tool.ruff]
 line-length = 120
-exclude = ["example", "tests"]
+exclude = ["example"]
 
 [tool.ruff.lint]
 select = [
@@ -151,7 +151,7 @@ ban-relative-imports = "all"
 
 [tool.ruff.lint.per-file-ignores]
 # Tests can use magic values, assertions, and relative imports
-"tests/**/*" = ["D", "PLR2004", "S101", "TID252", "ANN"]
+"tests/**/*" = ["D", "PLR2004", "S101", "TID252", "ANN", "ARG"]
 
 [tool.coverage.run]
 source = ["haystack_integrations"]
diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
@@ -1,4 +1,5 @@
 import inspect
+from asyncio import Semaphore, gather
 from typing import Any, Union, cast, get_args, get_origin
 
 from haystack import Document, component, default_from_dict, default_to_dict
@@ -50,17 +51,20 @@ class RagasEvaluator:
     ```
     """
 
-    def __init__(self, ragas_metrics: list[SimpleBaseMetric]) -> None:
+    def __init__(self, ragas_metrics: list[SimpleBaseMetric], concurrency_limit: int = 4) -> None:
         """
         Constructs a new Ragas evaluator.
 
         :param ragas_metrics: A list of modern Ragas metrics from `ragas.metrics.collections`.
             Each metric must be fully configured (including its LLM) at construction time.
             Available metrics can be found in the
             [Ragas documentation](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/).
+        :param concurrency_limit:
+            The maximum number of metric evaluations `run_async` runs concurrently; values below 1 are treated as 1.
         """
         self._validate_inputs(ragas_metrics)
         self.metrics = ragas_metrics
+        self.concurrency_limit = concurrency_limit
 
     @staticmethod
     def _validate_inputs(metrics: list[SimpleBaseMetric]) -> None:
@@ -148,6 +152,57 @@ def run(
 
         return {"result": results}
 
+    @component.output_types(result=dict[str, dict[str, MetricResult]])
+    async def run_async(
+        self,
+        query: str | None = None,
+        response: list[ChatMessage] | str | None = None,
+        documents: list[Document | str] | None = None,
+        reference_contexts: list[str] | None = None,
+        multi_responses: list[str] | None = None,
+        reference: str | None = None,
+        rubrics: dict[str, str] | None = None,
+    ) -> dict[str, dict[str, MetricResult]]:
+        """
+        Asynchronously evaluates the provided inputs against each metric and returns the results.
+
+        :param query: The input query from the user.
+        :param response: A string or a list of ChatMessage responses (typically from a language model or agent).
+        :param documents: A list of Haystack Documents or strings that were retrieved for the query.
+        :param reference_contexts: A list of reference contexts that should have been retrieved for the query.
+        :param multi_responses: List of multiple responses generated for the query.
+        :param reference: A string reference answer for the query.
+        :param rubrics: A dictionary of evaluation rubrics, where keys represent the score
+                        and the values represent the corresponding evaluation criteria.
+        :return: A dictionary with key `result` mapping metric names to their `MetricResult`.
+        """
+        processed_docs = self._process_documents(documents)
+        processed_response = self._process_response(response)
+
+        try:
+            sample = SingleTurnSample(
+                user_input=query,
+                retrieved_contexts=processed_docs,
+                reference_contexts=reference_contexts,
+                response=processed_response,
+                multi_responses=multi_responses,
+                reference=reference,
+                rubrics=rubrics,
+            )
+        except ValidationError as e:
+            self._handle_conversion_error(e)  # presumably raises; otherwise `sample` is unbound — TODO confirm
+
+        sem = Semaphore(max(1, self.concurrency_limit))  # clamp to at least one concurrent slot
+
+        async def _runner(metric: SimpleBaseMetric) -> tuple[str, MetricResult]:
+            async with sem:
+                return metric.name, await self._score_metric_async(metric, sample)
+
+        pairs = await gather(*[_runner(m) for m in self.metrics])  # results come back in self.metrics order
+        results: dict[str, MetricResult] = dict(pairs)  # a repeated metric name keeps only the last result
+
+        return {"result": results}
+
     def _score_metric(self, metric: SimpleBaseMetric, sample: SingleTurnSample) -> MetricResult:
         """
         Score a metric by inspecting its ascore() signature and passing only matching sample fields.
@@ -168,6 +223,26 @@ def _score_metric(self, metric: SimpleBaseMetric, sample: SingleTurnSample) -> M
         kwargs = {k: v for k, v in sample_dict.items() if k in valid_params and v is not None}
         return metric.score(**kwargs)
 
+    async def _score_metric_async(self, metric: SimpleBaseMetric, sample: SingleTurnSample) -> MetricResult:
+        """
+        Asynchronously score a metric by inspecting its ascore() signature and passing only matching sample fields.
+
+        :param metric: A SimpleBaseMetric instance to score.
+        :param sample: The SingleTurnSample holding all available input fields.
+        :return: The MetricResult obtained by awaiting the metric's ascore().
+        """
+        sig = inspect.signature(metric.ascore)
+        excluded = {"self", "callbacks"}  # never filled from the sample
+        valid_params = {
+            name
+            for name, param in sig.parameters.items()
+            if name not in excluded
+            and param.kind not in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)
+        }
+        sample_dict = sample.model_dump()
+        kwargs = {k: v for k, v in sample_dict.items() if k in valid_params and v is not None}
+        return await metric.ascore(**kwargs)
+
     def _process_documents(self, documents: list[Document | str] | None) -> list[str] | None:
         """
         Process and validate input documents.
diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py
@@ -1,5 +1,6 @@
+import inspect
 import os
-from unittest.mock import MagicMock
+from unittest.mock import AsyncMock, MagicMock
 
 import pytest
 from haystack import Document, Pipeline
@@ -47,6 +48,20 @@ async def ascore(user_input: str, response: str, retrieved_contexts: list) -> Me
     return metric
 
 
+def make_metric_async(name: str, score: float = 0.8, reason: str = "test reason") -> MagicMock:
+    """Create a mock SimpleBaseMetric whose AsyncMock ascore reports a concrete signature to inspect.signature."""
+    metric = MagicMock(spec=SimpleBaseMetric)
+    metric.name = name
+
+    async def ascore(user_input: str, response: str, retrieved_contexts: list) -> MetricResult:
+        return MetricResult(value=score, reason=reason)  # never executed; only the signature is read
+
+    mock_ascore = AsyncMock(return_value=MetricResult(value=score, reason=reason))
+    mock_ascore.__signature__ = inspect.signature(ascore)  # what inspect.signature(mock_ascore) reports
+    metric.ascore = mock_ascore
+    return metric
+
+
 class TestInit:
     def test_init(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test")
@@ -67,7 +82,7 @@ def test_init_with_multiple_metrics(self, monkeypatch):
         assert len(evaluator.metrics) == 2
 
     def test_invalid_metrics_raises_type_error(self):
-        with pytest.raises(TypeError, match="All items in ragas_metrics must be instances of SimpleBaseMetric."):
+        with pytest.raises(TypeError, match=r"All items in ragas_metrics must be instances of SimpleBaseMetric."):
             RagasEvaluator(ragas_metrics=["not_a_metric"])
 
 
@@ -167,6 +182,119 @@ def test_run_raises_on_invalid_input_types(self, invalid_input, field_name, erro
         assert error_message in str(exc_info.value)
 
 
+class TestRunAsync:
+    @pytest.mark.asyncio
+    async def test_run_async_returns_result_by_metric_name(self) -> None:
+        metric = make_metric_async("faithfulness", score=0.9)
+        evaluator = RagasEvaluator(ragas_metrics=[metric])
+        output = await evaluator.run_async(
+            query="Which is the most popular global sport?",
+            response="Football is the most popular sport.",
+            documents=["Football is undoubtedly the world's most popular sport."],
+        )
+        assert "result" in output
+        assert "faithfulness" in output["result"]
+        result = output["result"]["faithfulness"]
+        assert isinstance(result, MetricResult)
+        assert result.value == 0.9
+
+    @pytest.mark.asyncio
+    async def test_run_async_scores_all_metrics(self) -> None:
+        metrics = [make_metric_async("faithfulness", 0.9), make_metric_async("answer_relevancy", 0.7)]
+        evaluator = RagasEvaluator(ragas_metrics=metrics)
+        output = await evaluator.run_async(query="test?", response="answer", documents=["doc"])
+        assert set(output["result"].keys()) == {"faithfulness", "answer_relevancy"}
+        assert output["result"]["faithfulness"].value == 0.9
+        assert output["result"]["answer_relevancy"].value == 0.7
+
+    @pytest.mark.asyncio
+    async def test_run_async_calls_ascore_on_each_metric(self) -> None:
+        metric_a = make_metric_async("faithfulness")
+        metric_b = make_metric_async("answer_relevancy")
+        evaluator = RagasEvaluator(ragas_metrics=[metric_a, metric_b])
+        await evaluator.run_async(query="test?", response="answer", documents=["doc"])
+        metric_a.ascore.assert_called_once()
+        metric_b.ascore.assert_called_once()
+
+    @pytest.mark.asyncio
+    async def test_score_metric_async_passes_only_matching_params(self) -> None:
+        """Metric that only needs user_input + response should not receive retrieved_contexts."""
+        metric = MagicMock(spec=SimpleBaseMetric)
+        metric.name = "selective_metric"
+
+        async def ascore(user_input: str, response: str) -> MetricResult:
+            return MetricResult(value=0.5, reason="ok")
+
+        metric.ascore = ascore
+
+        evaluator = RagasEvaluator(ragas_metrics=[metric])
+        await evaluator.run_async(query="test?", response="answer", documents=["doc"], reference="ref")
+        # Only user_input and response may be passed — passing retrieved_contexts or reference raises a TypeError.
+        # NOTE(review): the run above repeats the one below; here ascore is replaced to capture what arrives.
+        captured = {}
+
+        async def capturing_ascore(user_input: str, response: str) -> MetricResult:
+            captured.update({"user_input": user_input, "response": response})
+            return MetricResult(value=0.5, reason="ok")
+
+        metric.ascore = capturing_ascore
+        await evaluator.run_async(query="test?", response="answer", documents=["doc"], reference="ref")
+        assert set(captured.keys()) == {"user_input", "response"}  # also proves capturing_ascore ran
+
+    @pytest.mark.asyncio
+    async def test_score_metric_async_omits_none_fields(self) -> None:
+        metric = make_metric_async("faithfulness")
+        evaluator = RagasEvaluator(ragas_metrics=[metric])
+        await evaluator.run_async(query="test?", response="answer")  # no documents → retrieved_contexts=None
+        _, kwargs = metric.ascore.call_args
+        assert "retrieved_contexts" not in kwargs
+
+    @pytest.mark.asyncio
+    async def test_run_async_accepts_document_objects(self) -> None:
+        metric = make_metric_async("faithfulness")
+        evaluator = RagasEvaluator(ragas_metrics=[metric])
+        await evaluator.run_async(
+            query="test?",
+            response="answer",
+            documents=[Document(content="some content"), Document(content="more content")],
+        )
+        _, kwargs = metric.ascore.call_args
+        assert kwargs["retrieved_contexts"] == ["some content", "more content"]
+
+    @pytest.mark.asyncio
+    async def test_run_async_accepts_string_documents(self):
+        metric = make_metric_async("faithfulness")
+        evaluator = RagasEvaluator(ragas_metrics=[metric])
+        await evaluator.run_async(query="test?", response="answer", documents=["doc one", "doc two"])
+        _, kwargs = metric.ascore.call_args
+        assert kwargs["retrieved_contexts"] == ["doc one", "doc two"]
+
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize(
+        "invalid_input,field_name,error_message",
+        [
+            (["Invalid query type"], "query", "'query' field expected"),
+            ([123, ["Invalid document"]], "documents", "'documents' must be a list"),
+            (["score_1"], "rubrics", "'rubrics' field expected"),
+        ],
+    )
+    async def test_run_async_raises_on_invalid_input_types(self, invalid_input, field_name, error_message):
+        evaluator = RagasEvaluator(ragas_metrics=[make_metric_async("faithfulness")])
+        query = "Which is the most popular global sport?"
+        documents = ["Football is the most popular sport."]
+        response = "Football is the most popular sport in the world"
+
+        with pytest.raises(ValueError) as exc_info:
+            if field_name == "query":
+                await evaluator.run_async(query=invalid_input, documents=documents, response=response)
+            elif field_name == "documents":
+                await evaluator.run_async(query=query, documents=invalid_input, response=response)
+            elif field_name == "rubrics":
+                await evaluator.run_async(query=query, rubrics=invalid_input, documents=documents, response=response)
+
+        assert error_message in str(exc_info.value)
+
+
 class TestSerialization:
     def test_to_dict(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test")