Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions haystack/components/evaluators/context_relevance.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@
#
# SPDX-License-Identifier: Apache-2.0

import math
from statistics import mean
from typing import Any

from haystack import component, default_from_dict, default_to_dict
from haystack import component, default_from_dict, default_to_dict, logging
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.components.generators.chat.types import ChatGenerator
from haystack.core.serialization import component_to_dict
from haystack.utils import deserialize_chatgenerator_inplace

logger = logging.getLogger(__name__)

# Private global variable for default examples to include in the prompt if the user does not provide any examples
_DEFAULT_EXAMPLES = [
{
Expand Down Expand Up @@ -181,8 +184,16 @@ def run(self, **inputs: Any) -> dict[str, Any]:
else:
res["score"] = 0

# calculate average context relevance score over all queries
result["score"] = mean([res["score"] for res in result["results"]])
# calculate average context relevance score over all queries, excluding failed queries
valid_scores = [res["score"] for res in result["results"] if not math.isnan(res["score"])]
failed_count = len(result["results"]) - len(valid_scores)
if failed_count:
logger.warning(
"{failed_count} out of {total} queries failed and were excluded from the score.",
failed_count=failed_count,
total=len(result["results"]),
)
result["score"] = mean(valid_scores) if valid_scores else float("nan")
result["individual_scores"] = [res["score"] for res in result["results"]] # useful for the EvaluationRunResult

return result
Expand Down
18 changes: 15 additions & 3 deletions haystack/components/evaluators/faithfulness.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,18 @@

from typing import Any

import math

from numpy import mean as np_mean

from haystack import component, default_from_dict, default_to_dict
from haystack import component, default_from_dict, default_to_dict, logging
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.components.generators.chat.types import ChatGenerator
from haystack.core.serialization import component_to_dict
from haystack.utils import deserialize_chatgenerator_inplace

logger = logging.getLogger(__name__)

# Default examples to include in the prompt if the user does not provide any examples
_DEFAULT_EXAMPLES = [
{
Expand Down Expand Up @@ -175,8 +179,16 @@ def run(self, **inputs: Any) -> dict[str, Any]:
else:
res["score"] = np_mean(res["statement_scores"])

# calculate average answer faithfulness score over all queries
result["score"] = np_mean([res["score"] for res in result["results"]])
# calculate average answer faithfulness score over all queries, excluding failed queries
valid_scores = [res["score"] for res in result["results"] if not math.isnan(res["score"])]
failed_count = len(result["results"]) - len(valid_scores)
if failed_count:
logger.warning(
"{failed_count} out of {total} queries failed and were excluded from the score.",
failed_count=failed_count,
total=len(result["results"]),
)
result["score"] = np_mean(valid_scores) if valid_scores else float("nan")
result["individual_scores"] = [res["score"] for res in result["results"]]

return result
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
fixes:
- |
``FaithfulnessEvaluator`` and ``ContextRelevanceEvaluator`` no longer propagate ``NaN`` silently into
the aggregate ``score`` when one or more LLM calls fail with ``raise_on_failure=False``. Failed
queries are now excluded from the mean calculation and a warning is logged indicating how many
queries were skipped. If every query fails, the aggregate score remains ``NaN``. Individual scores
in ``individual_scores`` and ``results`` are preserved as ``NaN`` for per-query transparency.
21 changes: 20 additions & 1 deletion test/components/evaluators/test_context_relevance_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,9 +233,28 @@ def chat_generator_run(self, *args, **kwargs):
]
results = component.run(questions=questions, contexts=contexts)

assert math.isnan(results["score"])
# Valid queries' scores are averaged; failed queries are excluded from the aggregate
assert results["score"] == 1.0
assert results["results"][0] == {"relevant_statements": ["c", "d"], "score": 1}
assert results["results"][1]["relevant_statements"] == []
assert math.isnan(results["results"][1]["score"]) # individual score preserved for transparency

def test_run_all_failed_returns_nan_score(self, monkeypatch):
    """
    Check that the evaluator reports NaN instead of crashing when every LLM call fails.

    The chat generator is patched so that each request raises. With `raise_on_failure=False` no valid
    score is left to average, so the aggregate score, the per-query scores and `individual_scores`
    are all expected to be NaN.
    """
    monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
    component = ContextRelevanceEvaluator(raise_on_failure=False)

    def chat_generator_run(self, *args, **kwargs):
        # Simulate a failure of every request sent to the LLM
        raise Exception("OpenAI API request failed.")

    monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

    questions = ["Who created Python?", "What is the capital of France?"]
    contexts = [["Python was created by Guido van Rossum."], ["Paris is the capital of France."]]
    results = component.run(questions=questions, contexts=contexts)

    # All queries failed: aggregate score should still be NaN (not a crash)
    assert math.isnan(results["score"])
    assert math.isnan(results["results"][0]["score"])
    assert math.isnan(results["results"][1]["score"])

    # individual_scores mirrors the per-query scores, so both failures must be visible there as well
    assert len(results["individual_scores"]) == 2
    assert math.isnan(results["individual_scores"][0])
    assert math.isnan(results["individual_scores"][1])

@pytest.mark.skipif(
Expand Down
24 changes: 22 additions & 2 deletions test/components/evaluators/test_faithfulness_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,17 +284,37 @@ def chat_generator_run(self, *args, **kwargs):
]
results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

assert math.isnan(results["score"])
# Valid queries' scores are averaged; failed queries are excluded from the aggregate
assert results["score"] == 1.0

assert results["individual_scores"][0] == 1.0
assert math.isnan(results["individual_scores"][1])
assert math.isnan(results["individual_scores"][1]) # individual score preserved as NaN for transparency

assert results["results"][0] == {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0}

assert results["results"][1]["statements"] == []
assert results["results"][1]["statement_scores"] == []
assert math.isnan(results["results"][1]["score"])

def test_run_all_failed_returns_nan_score(self, monkeypatch):
    """
    Check that the evaluator reports NaN instead of crashing when every LLM call fails.

    The chat generator is patched so that each request raises. With `raise_on_failure=False` no valid
    score is left to average, so the aggregate score, `individual_scores` and the per-query scores
    are all expected to be NaN.
    """
    monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
    component = FaithfulnessEvaluator(raise_on_failure=False)

    def chat_generator_run(self, *args, **kwargs):
        # Simulate a failure of every request sent to the LLM
        raise Exception("OpenAI API request failed.")

    monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

    questions = ["Who created Python?", "What is the capital of France?"]
    contexts = [["Python was created by Guido van Rossum."], ["Paris is the capital of France."]]
    predicted_answers = ["Guido van Rossum.", "Paris."]
    results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

    # All queries failed: aggregate score should still be NaN (not a crash)
    assert math.isnan(results["score"])
    assert len(results["individual_scores"]) == 2
    assert math.isnan(results["individual_scores"][0])
    assert math.isnan(results["individual_scores"][1])

    # The per-query result entries keep NaN too, matching what individual_scores reports
    assert math.isnan(results["results"][0]["score"])
    assert math.isnan(results["results"][1]["score"])

@pytest.mark.skipif(
not os.environ.get("OPENAI_API_KEY", None),
reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
Expand Down
Loading