diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py
index f5db0655e4..a59edf485e 100644
--- a/haystack/components/evaluators/context_relevance.py
+++ b/haystack/components/evaluators/context_relevance.py
@@ -2,15 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import math
 from statistics import mean
 from typing import Any
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.components.evaluators.llm_evaluator import LLMEvaluator
 from haystack.components.generators.chat.types import ChatGenerator
 from haystack.core.serialization import component_to_dict
 from haystack.utils import deserialize_chatgenerator_inplace
 
+logger = logging.getLogger(__name__)
+
 # Private global variable for default examples to include in the prompt if the user does not provide any examples
 _DEFAULT_EXAMPLES = [
     {
@@ -181,8 +184,16 @@ def run(self, **inputs: Any) -> dict[str, Any]:
             else:
                 res["score"] = 0
 
-        # calculate average context relevance score over all queries
-        result["score"] = mean([res["score"] for res in result["results"]])
+        # calculate average context relevance score over all queries, excluding failed queries
+        valid_scores = [res["score"] for res in result["results"] if not math.isnan(res["score"])]
+        failed_count = len(result["results"]) - len(valid_scores)
+        if failed_count:
+            logger.warning(
+                "{failed_count} out of {total} queries failed and were excluded from the score.",
+                failed_count=failed_count,
+                total=len(result["results"]),
+            )
+        result["score"] = mean(valid_scores) if valid_scores else float("nan")
         result["individual_scores"] = [res["score"] for res in result["results"]]  # useful for the EvaluationRunResult
 
         return result
diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py
index 456788c1aa..5e252de808 100644
--- a/haystack/components/evaluators/faithfulness.py
+++ b/haystack/components/evaluators/faithfulness.py
@@ -4,14 +4,17 @@
 
+import math
 from typing import Any
 
 from numpy import mean as np_mean
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.components.evaluators.llm_evaluator import LLMEvaluator
 from haystack.components.generators.chat.types import ChatGenerator
 from haystack.core.serialization import component_to_dict
 from haystack.utils import deserialize_chatgenerator_inplace
 
+logger = logging.getLogger(__name__)
+
 # Default examples to include in the prompt if the user does not provide any examples
 _DEFAULT_EXAMPLES = [
     {
@@ -175,8 +178,16 @@ def run(self, **inputs: Any) -> dict[str, Any]:
             else:
                 res["score"] = np_mean(res["statement_scores"])
 
-        # calculate average answer faithfulness score over all queries
-        result["score"] = np_mean([res["score"] for res in result["results"]])
+        # calculate average answer faithfulness score over all queries, excluding failed queries
+        valid_scores = [res["score"] for res in result["results"] if not math.isnan(res["score"])]
+        failed_count = len(result["results"]) - len(valid_scores)
+        if failed_count:
+            logger.warning(
+                "{failed_count} out of {total} queries failed and were excluded from the score.",
+                failed_count=failed_count,
+                total=len(result["results"]),
+            )
+        result["score"] = np_mean(valid_scores) if valid_scores else float("nan")
         result["individual_scores"] = [res["score"] for res in result["results"]]
 
         return result
diff --git a/releasenotes/notes/fix-evaluator-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml b/releasenotes/notes/fix-evaluator-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml
new file mode 100644
index 0000000000..90dc43a360
--- /dev/null
+++ b/releasenotes/notes/fix-evaluator-nan-aggregate-score-7c3a1b9d2e4f8a01.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    `FaithfulnessEvaluator` and `ContextRelevanceEvaluator` no longer propagate `NaN` silently into
+    the aggregate `score` when one or more LLM calls fail with `raise_on_failure=False`. Failed
+    queries are now excluded from the mean calculation and a warning is logged indicating how many
+    queries were skipped. If every query fails, the aggregate score remains `NaN`. Individual scores
+    in `individual_scores` and `results` are preserved as `NaN` for per-query transparency.
diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py
index 9eca698879..6ebc7f9896 100644
--- a/test/components/evaluators/test_context_relevance_evaluator.py
+++ b/test/components/evaluators/test_context_relevance_evaluator.py
@@ -233,9 +233,28 @@ def chat_generator_run(self, *args, **kwargs):
         ]
         results = component.run(questions=questions, contexts=contexts)
 
-        assert math.isnan(results["score"])
+        # Valid queries' scores are averaged; failed queries are excluded from the aggregate
+        assert results["score"] == 1.0
         assert results["results"][0] == {"relevant_statements": ["c", "d"], "score": 1}
         assert results["results"][1]["relevant_statements"] == []
+        assert math.isnan(results["results"][1]["score"])  # individual score preserved for transparency
+
+    def test_run_all_failed_returns_nan_score(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = ContextRelevanceEvaluator(raise_on_failure=False)
+
+        def chat_generator_run(self, *args, **kwargs):
+            raise Exception("OpenAI API request failed.")
+
+        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)
+
+        questions = ["Who created Python?", "What is the capital of France?"]
+        contexts = [["Python was created by Guido van Rossum."], ["Paris is the capital of France."]]
+        results = component.run(questions=questions, contexts=contexts)
+
+        # All queries failed: aggregate score should still be NaN (not a crash)
+        assert math.isnan(results["score"])
+        assert math.isnan(results["results"][0]["score"])
         assert math.isnan(results["results"][1]["score"])
 
     @pytest.mark.skipif(
diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py
index 64d113462a..8efecd8759 100644
--- a/test/components/evaluators/test_faithfulness_evaluator.py
+++ b/test/components/evaluators/test_faithfulness_evaluator.py
@@ -284,10 +284,11 @@ def chat_generator_run(self, *args, **kwargs):
         ]
         results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
 
-        assert math.isnan(results["score"])
+        # Valid queries' scores are averaged; failed queries are excluded from the aggregate
+        assert results["score"] == 1.0
 
         assert results["individual_scores"][0] == 1.0
-        assert math.isnan(results["individual_scores"][1])
+        assert math.isnan(results["individual_scores"][1])  # individual score preserved as NaN for transparency
 
         assert results["results"][0] == {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0}
 
@@ -295,6 +296,25 @@ def chat_generator_run(self, *args, **kwargs):
         assert results["results"][1]["statement_scores"] == []
         assert math.isnan(results["results"][1]["score"])
 
+    def test_run_all_failed_returns_nan_score(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = FaithfulnessEvaluator(raise_on_failure=False)
+
+        def chat_generator_run(self, *args, **kwargs):
+            raise Exception("OpenAI API request failed.")
+
+        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)
+
+        questions = ["Who created Python?", "What is the capital of France?"]
+        contexts = [["Python was created by Guido van Rossum."], ["Paris is the capital of France."]]
+        predicted_answers = ["Guido van Rossum.", "Paris."]
+        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+
+        # All queries failed: aggregate score should still be NaN (not a crash)
+        assert math.isnan(results["score"])
+        assert math.isnan(results["individual_scores"][0])
+        assert math.isnan(results["individual_scores"][1])
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",