From 7bbf960ff40afd12e62532807f2367bc98f86446 Mon Sep 17 00:00:00 2001 From: Aftabbs Date: Thu, 4 Jun 2026 16:11:53 +0530 Subject: [PATCH] fix(evaluators): exclude NaN from aggregate score in FaithfulnessEvaluator and ContextRelevanceEvaluator When raise_on_failure=False and one or more LLM calls fail, np_mean/mean over a list containing NaN silently returns NaN for the aggregate score. This means a single failed query poisons the whole batch result with no warning, breaking any downstream code that compares or reports the score. Filter out NaN entries before computing the mean so failed queries are excluded from the aggregate. Log a warning with the count of skipped queries. If all queries fail the aggregate remains NaN (unchanged). Individual scores in individual_scores and results are preserved as NaN for per-query transparency. Fixes #11383 --- .../evaluators/context_relevance.py | 17 ++++++++++--- .../components/evaluators/faithfulness.py | 18 +++++++++++--- ...-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml | 8 +++++++ .../test_context_relevance_evaluator.py | 21 +++++++++++++++- .../evaluators/test_faithfulness_evaluator.py | 24 +++++++++++++++++-- 5 files changed, 79 insertions(+), 9 deletions(-) create mode 100644 releasenotes/notes/fix-evaluator-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index f5db0655e4..a59edf485e 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -2,15 +2,18 @@ # # SPDX-License-Identifier: Apache-2.0 +import math from statistics import mean from typing import Any -from haystack import component, default_from_dict, default_to_dict +from haystack import component, default_from_dict, default_to_dict, logging from haystack.components.evaluators.llm_evaluator import LLMEvaluator from haystack.components.generators.chat.types import ChatGenerator from haystack.core.serialization import component_to_dict from haystack.utils import deserialize_chatgenerator_inplace +logger = logging.getLogger(__name__) + # Private global variable for default examples to include in the prompt if the user does not provide any examples _DEFAULT_EXAMPLES = [ { @@ -181,8 +184,16 @@ def run(self, **inputs: Any) -> dict[str, Any]: else: res["score"] = 0 - # calculate average context relevance score over all queries - result["score"] = mean([res["score"] for res in result["results"]]) + # calculate average context relevance score over all queries, excluding failed queries + valid_scores = [res["score"] for res in result["results"] if not math.isnan(res["score"])] + failed_count = len(result["results"]) - len(valid_scores) + if failed_count: + logger.warning( + "{failed_count} out of {total} queries failed and were excluded from the score.", + failed_count=failed_count, + total=len(result["results"]), + ) + result["score"] = mean(valid_scores) if valid_scores else float("nan") result["individual_scores"] = [res["score"] for res in result["results"]] # useful for the EvaluationRunResult return result diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 456788c1aa..5e252de808 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -4,14 +4,18 @@ from typing import Any +import math + from numpy import mean as np_mean -from haystack import component, default_from_dict, default_to_dict +from haystack import component, default_from_dict, default_to_dict, logging from haystack.components.evaluators.llm_evaluator import LLMEvaluator from haystack.components.generators.chat.types import ChatGenerator from haystack.core.serialization import component_to_dict from haystack.utils import deserialize_chatgenerator_inplace +logger = logging.getLogger(__name__) + # Default examples to include in the prompt if the user does not provide any examples _DEFAULT_EXAMPLES = [ { @@ -175,8 +179,16 @@ def run(self, **inputs: Any) -> dict[str, Any]: else: res["score"] = np_mean(res["statement_scores"]) - # calculate average answer faithfulness score over all queries - result["score"] = np_mean([res["score"] for res in result["results"]]) + # calculate average answer faithfulness score over all queries, excluding failed queries + valid_scores = [res["score"] for res in result["results"] if not math.isnan(res["score"])] + failed_count = len(result["results"]) - len(valid_scores) + if failed_count: + logger.warning( + "{failed_count} out of {total} queries failed and were excluded from the score.", + failed_count=failed_count, + total=len(result["results"]), + ) + result["score"] = np_mean(valid_scores) if valid_scores else float("nan") result["individual_scores"] = [res["score"] for res in result["results"]] return result diff --git a/releasenotes/notes/fix-evaluator-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml b/releasenotes/notes/fix-evaluator-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml new file mode 100644 index 0000000000..90dc43a360 --- /dev/null +++ b/releasenotes/notes/fix-evaluator-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + `FaithfulnessEvaluator` and `ContextRelevanceEvaluator` no longer propagate `NaN` silently into + the aggregate `score` when one or more LLM calls fail with `raise_on_failure=False`. Failed + queries are now excluded from the mean calculation and a warning is logged indicating how many + queries were skipped. If every query fails, the aggregate score remains `NaN`. Individual scores + in `individual_scores` and `results` are preserved as `NaN` for per-query transparency. diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py index 9eca698879..6ebc7f9896 100644 --- a/test/components/evaluators/test_context_relevance_evaluator.py +++ b/test/components/evaluators/test_context_relevance_evaluator.py @@ -233,9 +233,28 @@ def chat_generator_run(self, *args, **kwargs): ] results = component.run(questions=questions, contexts=contexts) - assert math.isnan(results["score"]) + # Valid queries' scores are averaged; failed queries are excluded from the aggregate + assert results["score"] == 1.0 assert results["results"][0] == {"relevant_statements": ["c", "d"], "score": 1} assert results["results"][1]["relevant_statements"] == [] + assert math.isnan(results["results"][1]["score"]) # individual score preserved for transparency + + def test_run_all_failed_returns_nan_score(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = ContextRelevanceEvaluator(raise_on_failure=False) + + def chat_generator_run(self, *args, **kwargs): + raise Exception("OpenAI API request failed.") + + monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run) + + questions = ["Who created Python?", "What is the capital of France?"] + contexts = [["Python was created by Guido van Rossum."], ["Paris is the capital of France."]] + results = component.run(questions=questions, contexts=contexts) + + # All queries failed: aggregate score should still be NaN (not a crash) + assert math.isnan(results["score"]) + assert math.isnan(results["results"][0]["score"]) assert math.isnan(results["results"][1]["score"]) @pytest.mark.skipif( diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index 64d113462a..8efecd8759 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -284,10 +284,11 @@ def chat_generator_run(self, *args, **kwargs): ] results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) - assert math.isnan(results["score"]) + # Valid queries' scores are averaged; failed queries are excluded from the aggregate + assert results["score"] == 1.0 assert results["individual_scores"][0] == 1.0 - assert math.isnan(results["individual_scores"][1]) + assert math.isnan(results["individual_scores"][1]) # individual score preserved as NaN for transparency assert results["results"][0] == {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0} @@ -295,6 +296,25 @@ def chat_generator_run(self, *args, **kwargs): assert results["results"][1]["statement_scores"] == [] assert math.isnan(results["results"][1]["score"]) + def test_run_all_failed_returns_nan_score(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = FaithfulnessEvaluator(raise_on_failure=False) + + def chat_generator_run(self, *args, **kwargs): + raise Exception("OpenAI API request failed.") + + monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run) + + questions = ["Who created Python?", "What is the capital of France?"] + contexts = [["Python was created by Guido van Rossum."], ["Paris is the capital of France."]] + predicted_answers = ["Guido van Rossum.", "Paris."] + results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) + + # All queries failed: aggregate score should still be NaN (not a crash) + assert math.isnan(results["score"]) + assert math.isnan(results["individual_scores"][0]) + assert math.isnan(results["individual_scores"][1]) + @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",