deepset-ai · Aftabbs · Jun 4, 2026 · Jun 5, 2026
@@ -2,15 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import math
 from statistics import mean
 from typing import Any
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.components.evaluators.llm_evaluator import LLMEvaluator
 from haystack.components.generators.chat.types import ChatGenerator
 from haystack.core.serialization import component_to_dict
 from haystack.utils import deserialize_chatgenerator_inplace
 
+logger = logging.getLogger(__name__)
+
 # Private global variable for default examples to include in the prompt if the user does not provide any examples
 _DEFAULT_EXAMPLES = [
     {
@@ -181,8 +184,16 @@ def run(self, **inputs: Any) -> dict[str, Any]:
             else:
                 res["score"] = 0
 
-        # calculate average context relevance score over all queries
-        result["score"] = mean([res["score"] for res in result["results"]])
+        # calculate average context relevance score over all queries, excluding failed queries
+        valid_scores = [res["score"] for res in result["results"] if not math.isnan(res["score"])]
+        failed_count = len(result["results"]) - len(valid_scores)
+        if failed_count:
+            logger.warning(
+                "{failed_count} out of {total} queries failed and were excluded from the score.",
+                failed_count=failed_count,
+                total=len(result["results"]),
+            )
+        result["score"] = mean(valid_scores) if valid_scores else float("nan")
         result["individual_scores"] = [res["score"] for res in result["results"]]  # useful for the EvaluationRunResult
 
         return result

@@ -4,14 +4,18 @@
 
 from typing import Any
 
+import math
+
 from numpy import mean as np_mean
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.components.evaluators.llm_evaluator import LLMEvaluator
 from haystack.components.generators.chat.types import ChatGenerator
 from haystack.core.serialization import component_to_dict
 from haystack.utils import deserialize_chatgenerator_inplace
 
+logger = logging.getLogger(__name__)
+
 # Default examples to include in the prompt if the user does not provide any examples
 _DEFAULT_EXAMPLES = [
     {
@@ -175,8 +179,16 @@ def run(self, **inputs: Any) -> dict[str, Any]:
             else:
                 res["score"] = np_mean(res["statement_scores"])
 
-        # calculate average answer faithfulness score over all queries
-        result["score"] = np_mean([res["score"] for res in result["results"]])
+        # calculate average answer faithfulness score over all queries, excluding failed queries
+        valid_scores = [res["score"] for res in result["results"] if not math.isnan(res["score"])]
+        failed_count = len(result["results"]) - len(valid_scores)
+        if failed_count:
+            logger.warning(
+                "{failed_count} out of {total} queries failed and were excluded from the score.",
+                failed_count=failed_count,
+                total=len(result["results"]),
+            )
+        result["score"] = np_mean(valid_scores) if valid_scores else float("nan")
         result["individual_scores"] = [res["score"] for res in result["results"]]
 
         return result

@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    ``FaithfulnessEvaluator`` and ``ContextRelevanceEvaluator`` no longer propagate ``NaN`` silently into
+    the aggregate ``score`` when one or more LLM calls fail with ``raise_on_failure=False``. Failed
+    queries are now excluded from the mean calculation and a warning is logged indicating how many
+    queries were skipped. If every query fails, the aggregate score remains ``NaN``. Individual scores
+    in ``individual_scores`` and ``results`` are preserved as ``NaN`` for per-query transparency.
@@ -233,9 +233,28 @@ def chat_generator_run(self, *args, **kwargs):
         ]
         results = component.run(questions=questions, contexts=contexts)
 
-        assert math.isnan(results["score"])
+        # Valid queries' scores are averaged; failed queries are excluded from the aggregate
+        assert results["score"] == 1.0
         assert results["results"][0] == {"relevant_statements": ["c", "d"], "score": 1}
         assert results["results"][1]["relevant_statements"] == []
+        assert math.isnan(results["results"][1]["score"])  # individual score preserved for transparency
+
+    def test_run_all_failed_returns_nan_score(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = ContextRelevanceEvaluator(raise_on_failure=False)
+
+        def chat_generator_run(self, *args, **kwargs):
+            raise Exception("OpenAI API request failed.")
+
+        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)
+
+        questions = ["Who created Python?", "What is the capital of France?"]
+        contexts = [["Python was created by Guido van Rossum."], ["Paris is the capital of France."]]
+        results = component.run(questions=questions, contexts=contexts)
+
+        # All queries failed: aggregate score should still be NaN (not a crash)
+        assert math.isnan(results["score"])
+        assert math.isnan(results["results"][0]["score"])
         assert math.isnan(results["results"][1]["score"])
 
     @pytest.mark.skipif(

@@ -284,17 +284,37 @@ def chat_generator_run(self, *args, **kwargs):
         ]
         results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
 
-        assert math.isnan(results["score"])
+        # Valid queries' scores are averaged; failed queries are excluded from the aggregate
+        assert results["score"] == 1.0
 
         assert results["individual_scores"][0] == 1.0
-        assert math.isnan(results["individual_scores"][1])
+        assert math.isnan(results["individual_scores"][1])  # individual score preserved as NaN for transparency
 
         assert results["results"][0] == {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0}
 
         assert results["results"][1]["statements"] == []
         assert results["results"][1]["statement_scores"] == []
         assert math.isnan(results["results"][1]["score"])
 
+    def test_run_all_failed_returns_nan_score(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = FaithfulnessEvaluator(raise_on_failure=False)
+
+        def chat_generator_run(self, *args, **kwargs):
+            raise Exception("OpenAI API request failed.")
+
+        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)
+
+        questions = ["Who created Python?", "What is the capital of France?"]
+        contexts = [["Python was created by Guido van Rossum."], ["Paris is the capital of France."]]
+        predicted_answers = ["Guido van Rossum.", "Paris."]
+        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+
+        # All queries failed: aggregate score should still be NaN (not a crash)
+        assert math.isnan(results["score"])
+        assert math.isnan(results["individual_scores"][0])
+        assert math.isnan(results["individual_scores"][1])
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",