From 7bbf960ff40afd12e62532807f2367bc98f86446 Mon Sep 17 00:00:00 2001
From: Aftabbs <aftabbs.wwe@gmail.com>
Date: Thu, 4 Jun 2026 16:11:53 +0530
Subject: [PATCH] fix(evaluators): exclude NaN from aggregate score in
 FaithfulnessEvaluator and ContextRelevanceEvaluator

When raise_on_failure=False and one or more LLM calls fail, the failed
queries get a NaN score, and numpy.mean (FaithfulnessEvaluator) or
statistics.mean (ContextRelevanceEvaluator) over a list containing NaN
silently returns NaN for the aggregate score. A single failed query thus
poisons the whole batch result with no warning, breaking any downstream
code that compares or reports the score.

Filter out NaN entries before computing the mean so that failed queries
are excluded from the aggregate, and log a warning with the number of
excluded queries out of the total. If all queries fail, the aggregate
remains NaN (unchanged behavior). Per-query scores in individual_scores
and results are still reported as NaN for transparency.

Fixes #11383
---
 .../evaluators/context_relevance.py           | 17 ++++++++++---
 .../components/evaluators/faithfulness.py     | 18 +++++++++++---
 ...-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml |  8 +++++++
 .../test_context_relevance_evaluator.py       | 21 +++++++++++++++-
 .../evaluators/test_faithfulness_evaluator.py | 24 +++++++++++++++++--
 5 files changed, 79 insertions(+), 9 deletions(-)
 create mode 100644 releasenotes/notes/fix-evaluator-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml

diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py
index f5db0655e4..a59edf485e 100644
--- a/haystack/components/evaluators/context_relevance.py
+++ b/haystack/components/evaluators/context_relevance.py
@@ -2,15 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import math
 from statistics import mean
 from typing import Any
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.components.evaluators.llm_evaluator import LLMEvaluator
 from haystack.components.generators.chat.types import ChatGenerator
 from haystack.core.serialization import component_to_dict
 from haystack.utils import deserialize_chatgenerator_inplace
 
+logger = logging.getLogger(__name__)
+
 # Private global variable for default examples to include in the prompt if the user does not provide any examples
 _DEFAULT_EXAMPLES = [
     {
@@ -181,8 +184,16 @@ def run(self, **inputs: Any) -> dict[str, Any]:
             else:
                 res["score"] = 0
 
-        # calculate average context relevance score over all queries
-        result["score"] = mean([res["score"] for res in result["results"]])
+        # calculate average context relevance score over all queries, excluding failed queries
+        valid_scores = [res["score"] for res in result["results"] if not math.isnan(res["score"])]
+        failed_count = len(result["results"]) - len(valid_scores)
+        if failed_count:
+            logger.warning(
+                "{failed_count} out of {total} queries failed and were excluded from the score.",
+                failed_count=failed_count,
+                total=len(result["results"]),
+            )
+        result["score"] = mean(valid_scores) if valid_scores else float("nan")
         result["individual_scores"] = [res["score"] for res in result["results"]]  # useful for the EvaluationRunResult
 
         return result
diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py
index 456788c1aa..5e252de808 100644
--- a/haystack/components/evaluators/faithfulness.py
+++ b/haystack/components/evaluators/faithfulness.py
@@ -4,14 +4,18 @@
 
 from typing import Any
 
+import math
+
 from numpy import mean as np_mean
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.components.evaluators.llm_evaluator import LLMEvaluator
 from haystack.components.generators.chat.types import ChatGenerator
 from haystack.core.serialization import component_to_dict
 from haystack.utils import deserialize_chatgenerator_inplace
 
+logger = logging.getLogger(__name__)
+
 # Default examples to include in the prompt if the user does not provide any examples
 _DEFAULT_EXAMPLES = [
     {
@@ -175,8 +179,16 @@ def run(self, **inputs: Any) -> dict[str, Any]:
             else:
                 res["score"] = np_mean(res["statement_scores"])
 
-        # calculate average answer faithfulness score over all queries
-        result["score"] = np_mean([res["score"] for res in result["results"]])
+        # calculate average answer faithfulness score over all queries, excluding failed queries
+        valid_scores = [res["score"] for res in result["results"] if not math.isnan(res["score"])]
+        failed_count = len(result["results"]) - len(valid_scores)
+        if failed_count:
+            logger.warning(
+                "{failed_count} out of {total} queries failed and were excluded from the score.",
+                failed_count=failed_count,
+                total=len(result["results"]),
+            )
+        result["score"] = np_mean(valid_scores) if valid_scores else float("nan")
         result["individual_scores"] = [res["score"] for res in result["results"]]
 
         return result
diff --git a/releasenotes/notes/fix-evaluator-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml b/releasenotes/notes/fix-evaluator-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml
new file mode 100644
index 0000000000..90dc43a360
--- /dev/null
+++ b/releasenotes/notes/fix-evaluator-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    `FaithfulnessEvaluator` and `ContextRelevanceEvaluator` no longer propagate `NaN` silently into
+    the aggregate `score` when one or more LLM calls fail with `raise_on_failure=False`. Failed
+    queries are now excluded from the mean calculation and a warning is logged indicating how many
+    queries were skipped. If every query fails, the aggregate score remains `NaN`. Individual scores
+    in `individual_scores` and `results` are preserved as `NaN` for per-query transparency.
diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py
index 9eca698879..6ebc7f9896 100644
--- a/test/components/evaluators/test_context_relevance_evaluator.py
+++ b/test/components/evaluators/test_context_relevance_evaluator.py
@@ -233,9 +233,28 @@ def chat_generator_run(self, *args, **kwargs):
         ]
         results = component.run(questions=questions, contexts=contexts)
 
-        assert math.isnan(results["score"])
+        # Valid queries' scores are averaged; failed queries are excluded from the aggregate
+        assert results["score"] == 1.0
         assert results["results"][0] == {"relevant_statements": ["c", "d"], "score": 1}
         assert results["results"][1]["relevant_statements"] == []
+        assert math.isnan(results["results"][1]["score"])  # individual score preserved for transparency
+
+    def test_run_all_failed_returns_nan_score(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = ContextRelevanceEvaluator(raise_on_failure=False)
+
+        def chat_generator_run(self, *args, **kwargs):
+            raise Exception("OpenAI API request failed.")
+
+        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)
+
+        questions = ["Who created Python?", "What is the capital of France?"]
+        contexts = [["Python was created by Guido van Rossum."], ["Paris is the capital of France."]]
+        results = component.run(questions=questions, contexts=contexts)
+
+        # All queries failed: aggregate score should still be NaN (not a crash)
+        assert math.isnan(results["score"])
+        assert math.isnan(results["results"][0]["score"])
         assert math.isnan(results["results"][1]["score"])
 
     @pytest.mark.skipif(
diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py
index 64d113462a..8efecd8759 100644
--- a/test/components/evaluators/test_faithfulness_evaluator.py
+++ b/test/components/evaluators/test_faithfulness_evaluator.py
@@ -284,10 +284,11 @@ def chat_generator_run(self, *args, **kwargs):
         ]
         results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
 
-        assert math.isnan(results["score"])
+        # Valid queries' scores are averaged; failed queries are excluded from the aggregate
+        assert results["score"] == 1.0
 
         assert results["individual_scores"][0] == 1.0
-        assert math.isnan(results["individual_scores"][1])
+        assert math.isnan(results["individual_scores"][1])  # individual score preserved as NaN for transparency
 
         assert results["results"][0] == {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0}
 
@@ -295,6 +296,25 @@ def chat_generator_run(self, *args, **kwargs):
         assert results["results"][1]["statement_scores"] == []
         assert math.isnan(results["results"][1]["score"])
 
+    def test_run_all_failed_returns_nan_score(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = FaithfulnessEvaluator(raise_on_failure=False)
+
+        def chat_generator_run(self, *args, **kwargs):
+            raise Exception("OpenAI API request failed.")
+
+        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)
+
+        questions = ["Who created Python?", "What is the capital of France?"]
+        contexts = [["Python was created by Guido van Rossum."], ["Paris is the capital of France."]]
+        predicted_answers = ["Guido van Rossum.", "Paris."]
+        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+
+        # All queries failed: aggregate score should still be NaN (not a crash)
+        assert math.isnan(results["score"])
+        assert math.isnan(results["individual_scores"][0])
+        assert math.isnan(results["individual_scores"][1])
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",