feat(evaluators): add native run_async support to LLMEvaluator (#11581)

GovindhKishore · sjrl · web-flow · commit 4dd018a024fe · 2026-06-15T12:48:41.000+02:00
Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com>
diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py
@@ -174,7 +174,39 @@ def run(self, **inputs: Any) -> dict[str, Any]:
                 - `results`: A list of dictionaries with `relevant_statements` and `score` for each input context.
         """
         result = super(ContextRelevanceEvaluator, self).run(**inputs)  # noqa: UP008
+        # Post-process the raw results to calculate relevance metrics and scores
+        return self._postprocess_results(result)
 
+    @component.output_types(score=float, results=list[dict[str, Any]])
+    async def run_async(self, **inputs: Any) -> dict[str, Any]:
+        """
+        Asynchronously evaluate how relevant the supplied contexts are to their questions.
+
+        :param questions:
+            The questions to evaluate, as a list.
+        :param contexts:
+            A list of lists of contexts. Each inner list holds the contexts of one question.
+        :returns:
+            A dictionary with the following outputs:
+                - `score`: Mean context relevance score over all the provided input questions.
+                - `results`: A list of dictionaries with `relevant_statements` and `score` for each input context.
+        """
+        # Delegate the LLM calls to the base evaluator, then derive the relevance scores from its raw output
+        raw_output = await super(ContextRelevanceEvaluator, self).run_async(**inputs)  # noqa: UP008
+        return self._postprocess_results(raw_output)
+
+    def _postprocess_results(self, result: dict[str, Any]) -> dict[str, Any]:
+        """
+        Post-processes raw LLM evaluator outputs to compute context relevance scores.
+
+        Calculates binary scores based on whether relevant statements were found,
+        averages the scores across all successful queries, and updates the result payload.
+
+        :param result:
+            The raw evaluation dictionary from the base LLM evaluator.
+        :returns:
+            The updated dictionary containing final scores and tracking metrics.
+        """
         for idx, res in enumerate(result["results"]):
             if res is None:
                 result["results"][idx] = {"relevant_statements": [], "score": float("nan")}
diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py
@@ -149,7 +149,7 @@ def __init__(
             progress_bar=progress_bar,
         )
 
-    @component.output_types(individual_scores=list[int], score=float, results=list[dict[str, Any]])
+    @component.output_types(individual_scores=list[float], score=float, results=list[dict[str, Any]])
     def run(self, **inputs: Any) -> dict[str, Any]:
         """
         Run the LLM evaluator.
@@ -167,6 +167,42 @@ def run(self, **inputs: Any) -> dict[str, Any]:
                 - `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
         """
         result = super(FaithfulnessEvaluator, self).run(**inputs)  # noqa: UP008
+        # Post-process the raw results to calculate faithfulness metrics and scores
+        return self._postprocess_results(result)
+
+    @component.output_types(individual_scores=list[float], score=float, results=list[dict[str, Any]])
+    async def run_async(self, **inputs: Any) -> dict[str, Any]:
+        """
+        Asynchronously evaluate how faithful the predicted answers are to the supplied contexts.
+
+        :param questions:
+            The questions to evaluate, as a list.
+        :param contexts:
+            A nested list of contexts that correspond to the questions.
+        :param predicted_answers:
+            The predicted answers to evaluate, as a list.
+        :returns:
+            A dictionary with the following outputs:
+                - `score`: Mean faithfulness score over all the provided input answers.
+                - `individual_scores`: A list of faithfulness scores for each input answer.
+                - `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
+        """
+        # Delegate the LLM calls to the base evaluator, then derive the faithfulness scores from its raw output
+        raw_output = await super(FaithfulnessEvaluator, self).run_async(**inputs)  # noqa: UP008
+        return self._postprocess_results(raw_output)
+
+    def _postprocess_results(self, result: dict[str, Any]) -> dict[str, Any]:
+        """
+        Post-processes raw LLM evaluator outputs to compute faithfulness scores.
+
+        Calculates statement-level score averages, computes the overall mean faithfulness
+        score across successful queries, and updates the result payload.
+
+        :param result:
+            The raw evaluation dictionary from the base LLM evaluator.
+        :returns:
+            The updated dictionary containing final scores and tracking metrics.
+        """
 
         # calculate average statement faithfulness score per query
         for idx, res in enumerate(result["results"]):
diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py
@@ -2,10 +2,12 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import asyncio
 import json
 from typing import Any
 
 from tqdm import tqdm
+from tqdm.asyncio import tqdm as async_tqdm
 
 from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.components.builders import PromptBuilder
@@ -240,6 +242,84 @@ def run(self, **inputs: Any) -> dict[str, Any]:
 
         return {"results": results, "meta": metadata or None}
 
+    @component.output_types(results=list[dict[str, Any]])
+    async def run_async(self, **inputs: Any) -> dict[str, Any]:
+        """
+        Run the LLM evaluator asynchronously.
+
+        Inputs are evaluated one after another. A chat generator without a `run_async` method is not rejected:
+        its synchronous `run` method is executed in a worker thread so that the event loop is not blocked.
+
+        :param inputs:
+            The input values to evaluate. The keys are the input names and the values are lists of input values.
+        :returns:
+            A dictionary with a `results` entry that contains a list of results.
+            Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator
+            and the evaluation results as the values. If an exception occurs for a particular input value, the result
+            will be `None` for that entry.
+            The dictionary also has a `meta` entry: the list of metadata found on the first reply of each response,
+            or `None` if no reply carried any metadata.
+        :raises ValueError:
+            Only in the case that `raise_on_failure` is set to True and the received inputs are not lists or have
+            different lengths, the generator call fails, or the output is not valid JSON or lacks the expected keys.
+        """
+        if not self._is_warmed_up:
+            self.warm_up()
+
+        self.validate_input_parameters(dict(self.inputs), inputs)
+
+        # inputs is a dictionary with keys being input names and values being a list of input values
+        # We need to iterate through the lists in parallel for all keys of the dictionary
+        input_names, values = inputs.keys(), list(zip(*inputs.values(), strict=True))
+        list_of_input_names_to_values = [dict(zip(input_names, v, strict=True)) for v in values]
+
+        results: list[dict[str, Any] | None] = []
+        metadata = []
+        errors = 0
+        # Checked once up front: generators without a coroutine API are run in a worker thread instead
+        generator_has_async = hasattr(self._chat_generator, "run_async")
+        for input_names_to_values in async_tqdm(list_of_input_names_to_values, disable=not self.progress_bar):
+            prompt = self.builder.run(**input_names_to_values)
+            messages = [ChatMessage.from_user(prompt["prompt"])]
+            try:
+                if generator_has_async:
+                    result = await self._chat_generator.run_async(messages=messages)  # type: ignore[attr-defined]
+                else:
+                    logger.debug(
+                        "{generator_type} does not implement 'run_async'."
+                        " Running the synchronous 'run' method in a thread to avoid blocking the event loop.",
+                        generator_type=type(self._chat_generator).__name__,
+                    )
+                    result = await asyncio.to_thread(self._chat_generator.run, messages=messages)
+            except Exception as e:
+                if self.raise_on_failure:
+                    raise ValueError(f"Error while generating response for prompt: {prompt}. Error: {e}") from e
+                logger.warning("Error while generating response for prompt: {prompt}. Error: {e}", prompt=prompt, e=e)
+                results.append(None)
+                errors += 1
+                continue
+
+            parsed_result = _parse_dict_from_json(
+                result["replies"][0].text, expected_keys=self.outputs, raise_on_failure=self.raise_on_failure
+            )
+            if parsed_result is None:
+                results.append(None)
+                errors += 1
+            else:
+                results.append(parsed_result)
+            # Reply metadata is collected even when the reply itself could not be parsed
+            if result["replies"][0].meta:
+                metadata.append(result["replies"][0].meta)
+
+        if errors > 0:
+            logger.warning(
+                "LLM evaluator failed for {errors} out of {len} inputs.",
+                errors=errors,
+                len=len(list_of_input_names_to_values),
+            )
+
+        return {"results": results, "meta": metadata or None}
+
     def prepare_template(self) -> str:
         """
         Prepare the prompt template.
diff --git a/releasenotes/notes/feat-llm-evaluator-run-async-3d7fb1d0991c23ce.yaml b/releasenotes/notes/feat-llm-evaluator-run-async-3d7fb1d0991c23ce.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Added native asynchronous support (``run_async``) to ``LLMEvaluator``, ``FaithfulnessEvaluator``, and ``ContextRelevanceEvaluator``. Evaluators can now be awaited inside async applications such as FastMCP or FastAPI without blocking the main event loop, so other tasks keep running while an evaluation is in progress. Chat generators that do not implement ``run_async`` are automatically run in a worker thread.
diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py
@@ -262,3 +262,84 @@ def test_live_run(self):
         assert "prompt_tokens" in result["meta"][0]["usage"]
         assert "completion_tokens" in result["meta"][0]["usage"]
         assert "total_tokens" in result["meta"][0]["usage"]
+
+
+class TestContextRelevanceEvaluatorAsync:
+    @pytest.mark.asyncio
+    async def test_run_async_calculates_mean_score(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        evaluator = ContextRelevanceEvaluator()
+
+        async def fake_run_async(self, *args, **kwargs):
+            payload = '{"relevant_statements": [], "score": 0}'
+            if "Football" in kwargs["messages"][0].text:
+                payload = '{"relevant_statements": ["a", "b"], "score": 1}'
+            return {"replies": [ChatMessage.from_assistant(payload)]}
+
+        target = "haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run_async"
+        monkeypatch.setattr(target, fake_run_async)
+        # Only the first context mentions football, so one of the two stubbed replies scores 1
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [
+            ["Football is the world's most popular sport."],
+            ["Python is a cross-platform programming language."],
+        ]
+
+        output = await evaluator.run_async(questions=questions, contexts=contexts)
+
+        assert output == {
+            "results": [{"score": 1, "relevant_statements": ["a", "b"]}, {"score": 0, "relevant_statements": []}],
+            "score": 0.5,
+            "meta": None,
+            "individual_scores": [1, 0],
+        }
+
+    @pytest.mark.asyncio
+    async def test_run_async_returns_nan_raise_on_failure_false(self, monkeypatch, caplog):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        evaluator = ContextRelevanceEvaluator(raise_on_failure=False)
+
+        async def fake_run_async(self, *args, **kwargs):
+            if "Python" not in kwargs["messages"][0].text:
+                return {"replies": [ChatMessage.from_assistant('{"relevant_statements": ["c", "d"], "score": 1}')]}
+            raise Exception("OpenAI API request failed.")
+
+        target = "haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run_async"
+        monkeypatch.setattr(target, fake_run_async)
+        # The second prompt mentions Python, so the stubbed generator raises for it
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [["Football is popular."], ["Python was created by Guido van Rossum."]]
+
+        with caplog.at_level("WARNING", logger="haystack.components.evaluators.context_relevance"):
+            output = await evaluator.run_async(questions=questions, contexts=contexts)
+
+        assert output["score"] == 1
+        assert output["results"][0] == {"relevant_statements": ["c", "d"], "score": 1}
+        failed_entry = output["results"][1]
+        assert failed_entry["relevant_statements"] == []
+        assert math.isnan(failed_entry["score"])
+
+        assert "1 query(s) failed and were excluded from the score." in caplog.text
+
+    @pytest.mark.asyncio
+    @pytest.mark.skipif(
+        not os.environ.get("OPENAI_API_KEY"),
+        reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
+    )
+    @pytest.mark.integration
+    async def test_live_run_async(self):
+        questions = ["Who created the Python language?"]
+        contexts = [["Python, created by Guido van Rossum, is a high-level general-purpose programming language."]]
+
+        evaluator = ContextRelevanceEvaluator(chat_generator=OpenAIChatGenerator(model="gpt-4.1-nano"))
+        output = await evaluator.run_async(questions=questions, contexts=contexts)
+
+        # Keys that must be present at the top level and on the first per-input result
+        assert {"results"}.issubset(output)
+        first_result = output["results"][0]
+        assert {"score", "relevant_statements"}.issubset(first_result)
+
+        assert "meta" in output
+        usage = output["meta"][0]["usage"]
+        for token_field in ("prompt_tokens", "completion_tokens", "total_tokens"):
+            assert token_field in usage
diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py
@@ -320,3 +320,87 @@ def test_live_run(self):
         assert "prompt_tokens" in result["meta"][0]["usage"]
         assert "completion_tokens" in result["meta"][0]["usage"]
         assert "total_tokens" in result["meta"][0]["usage"]
+
+
+class TestFaithfulnessEvaluatorAsync:
+    @pytest.mark.asyncio
+    async def test_run_async_calculates_mean_score(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        evaluator = FaithfulnessEvaluator()
+
+        async def fake_run_async(self, *args, **kwargs):
+            payload = '{"statements": ["c", "d"], "statement_scores": [1, 1]}'
+            if "Football" in kwargs["messages"][0].text:
+                payload = '{"statements": ["a", "b"], "statement_scores": [1, 0]}'
+            return {"replies": [ChatMessage.from_assistant(payload)]}
+
+        target = "haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run_async"
+        monkeypatch.setattr(target, fake_run_async)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [["Football is the world's most popular sport."], ["Python was created by Guido van Rossum."]]
+        predicted_answers = ["Football is the most popular sport.", "Python is a language created by George Lucas."]
+        # The stub scores the football prompt [1, 0] and the other prompt [1, 1]
+        output = await evaluator.run_async(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+
+        assert output == {
+            "individual_scores": [0.5, 1.0],
+            "results": [
+                {"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
+                {"score": 1.0, "statement_scores": [1, 1], "statements": ["c", "d"]},
+            ],
+            "score": 0.75,
+            "meta": None,
+        }
+
+    @pytest.mark.asyncio
+    async def test_run_async_returns_nan_raise_on_failure_false(self, monkeypatch, caplog):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        evaluator = FaithfulnessEvaluator(raise_on_failure=False)
+
+        async def fake_run_async(self, *args, **kwargs):
+            if "Python" not in kwargs["messages"][0].text:
+                return {"replies": [ChatMessage.from_assistant('{"statements": ["c", "d"], "statement_scores": [1, 1]}')]}
+            raise Exception("OpenAI API request failed.")
+
+        target = "haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run_async"
+        monkeypatch.setattr(target, fake_run_async)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [["Football is popular."], ["Python was created by Guido."]]
+        predicted_answers = ["Football is popular.", "Guido van Rossum."]
+
+        with caplog.at_level("WARNING", logger="haystack.components.evaluators.faithfulness"):
+            output = await evaluator.run_async(
+                questions=questions, contexts=contexts, predicted_answers=predicted_answers
+            )
+
+        assert output["score"] == 1.0
+        scores = output["individual_scores"]
+        assert scores[0] == 1.0
+        assert math.isnan(scores[1])
+        assert "1 query(s) failed and were excluded from the score." in caplog.text
+
+    @pytest.mark.asyncio
+    @pytest.mark.skipif(
+        not os.environ.get("OPENAI_API_KEY"),
+        reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
+    )
+    @pytest.mark.integration
+    async def test_live_run_async(self):
+        questions = ["What is Python and who created it?"]
+        contexts = [["Python is a programming language created by Guido van Rossum."]]
+        predicted_answers = ["Python is a programming language created by George Lucas."]
+        evaluator = FaithfulnessEvaluator(chat_generator=OpenAIChatGenerator(model="gpt-4.1-nano"))
+        output = await evaluator.run_async(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+
+        # Keys that must be present at the top level and on the first per-input result
+        assert {"individual_scores", "results", "score"}.issubset(output)
+        first_result = output["results"][0]
+        assert {"score", "statement_scores", "statements"}.issubset(first_result)
+
+        # token usage metadata must be reported alongside the results
+        assert "meta" in output
+        usage = output["meta"][0]["usage"]
+        for token_field in ("prompt_tokens", "completion_tokens", "total_tokens"):
+            assert token_field in usage
diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +---
 +features:
 +  - |
 +    Added native asynchronous support (``run_async``) to ``LLMEvaluator``, ``FaithfulnessEvaluator``, and ``ContextRelevanceEvaluator``. Evaluators can now be awaited inside async applications such as FastMCP or FastAPI without blocking the main event loop, so other tasks keep running while an evaluation is in progress. Chat generators that do not implement ``run_async`` are automatically run in a worker thread.