From f87cef5d230e487ea3c4fecb26dd532813ac784b Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Wed, 22 Apr 2026 12:58:26 +0200
Subject: [PATCH 01/20] start migration

---
 integrations/ragas/pyproject.toml             |   2 +-
 .../components/evaluators/ragas/evaluator.py  | 148 +++++------
 integrations/ragas/tests/test_evaluator.py    | 229 ++++++++++--------
 3 files changed, 180 insertions(+), 199 deletions(-)

diff --git a/integrations/ragas/pyproject.toml b/integrations/ragas/pyproject.toml
index cfe2c9c0e7..a103c36337 100644
--- a/integrations/ragas/pyproject.toml
+++ b/integrations/ragas/pyproject.toml
@@ -23,7 +23,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["haystack-ai>=2.22.0", "ragas>=0.2.6,<0.3.0"]
+dependencies = ["haystack-ai>=2.22.0", "ragas>=0.4.0"]
 
 [project.urls]
 Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/ragas"
diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
index addb042807..c3a0a9c4eb 100644
--- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
+++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
@@ -1,19 +1,13 @@
-import re
+import inspect
 from typing import Any, Union, cast, get_args, get_origin
 
 from haystack import Document, component
 from haystack.dataclasses import ChatMessage
 from pydantic import ValidationError
-from ragas import evaluate
-from ragas.dataset_schema import (
-    EvaluationDataset,
-    EvaluationResult,
-    SingleTurnSample,
-)
-from ragas.embeddings import BaseRagasEmbeddings
-from ragas.llms import BaseRagasLLM
-from ragas.metrics import Metric
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics.base import SimpleBaseMetric
+from ragas.metrics.result import MetricResult
 
 
 @component
@@ -23,19 +17,21 @@ class RagasEvaluator:
 
     See the [Ragas framework](https://docs.ragas.io/) for more details.
 
+    This component supports the modern Ragas metrics API (`ragas.metrics.collections`).
+    Each metric must be a `SimpleBaseMetric` instance with its LLM configured at construction time.
+
     Usage example:
     ```python
-    from haystack.components.generators import OpenAIGenerator
+    from openai import AsyncOpenAI
+    from ragas.llms.base import llm_factory
+    from ragas.metrics.collections import Faithfulness
     from haystack_integrations.components.evaluators.ragas import RagasEvaluator
-    from ragas.metrics import ContextPrecision
-    from ragas.llms import HaystackLLMWrapper
 
-    llm = OpenAIGenerator(model="gpt-4o-mini")
-    evaluator_llm = HaystackLLMWrapper(llm)
+    client = AsyncOpenAI()
+    llm = llm_factory("gpt-4o-mini", client=client)
 
     evaluator = RagasEvaluator(
-        ragas_metrics=[ContextPrecision()],
-        evaluator_llm=evaluator_llm
+        ragas_metrics=[Faithfulness(llm=llm)],
     )
     output = evaluator.run(
         query="Which is the most popular global sport?",
        ...
    )
    ...
    ```
    """

-    def __init__(
-        self,
-        ragas_metrics: list[Metric],
-        evaluator_llm: BaseRagasLLM | None = None,
-        evaluator_embedding: BaseRagasEmbeddings | None = None,
-    ) -> None:
+    def __init__(self, ragas_metrics: list[SimpleBaseMetric]) -> None:
         """
         Constructs a new Ragas evaluator.
 
-        :param ragas_metrics: A list of evaluation metrics from the [Ragas](https://docs.ragas.io/) library.
-        :param evaluator_llm: A language model used by metrics that require LLMs for evaluation.
-        :param evaluator_embedding: An embedding model used by metrics that require embeddings for evaluation.
+        :param ragas_metrics: A list of modern Ragas metrics from `ragas.metrics.collections`.
+            Each metric must be fully configured (including its LLM) at construction time.
         """
-        self._validate_inputs(ragas_metrics, evaluator_llm, evaluator_embedding)
+        self._validate_inputs(ragas_metrics)
         self.metrics = ragas_metrics
-        self.llm = evaluator_llm
-        self.embedding = evaluator_embedding
 
-    def _validate_inputs(
-        self,
-        metrics: list[Metric],
-        llm: BaseRagasLLM | None,
-        embedding: BaseRagasEmbeddings | None,
-    ) -> None:
+    def _validate_inputs(self, metrics: list[SimpleBaseMetric]) -> None:
         """
         Validate input parameters.
 
-        :param metrics: List of Ragas metrics to validate
-        :param llm: Language model to validate
-        :param embedding: Embedding model to validate
-
+        :param metrics: List of Ragas metrics to validate.
         :return: None.
         """
-        if not all(isinstance(metric, Metric) for metric in metrics):
-            error_message = "All items in ragas_metrics must be instances of Metric class."
-            raise TypeError(error_message)
-
-        if llm is not None and not isinstance(llm, BaseRagasLLM):
-            error_message = f"Expected evaluator_llm to be BaseRagasLLM, got {type(llm).__name__}"
+        if not all(isinstance(metric, SimpleBaseMetric) for metric in metrics):
+            error_message = "All items in ragas_metrics must be instances of SimpleBaseMetric."
             raise TypeError(error_message)
 
-        if embedding is not None and not isinstance(embedding, BaseRagasEmbeddings):
-            error_message = f"Expected evaluator_embedding to be BaseRagasEmbeddings, got {type(embedding).__name__}"
-            raise TypeError(error_message)
-
-    @component.output_types(result=EvaluationResult)
+    @component.output_types(result=dict)
     def run(
         self,
         query: str | None = None,
@@ -110,7 +82,7 @@ def run(
         rubrics: dict[str, str] | None = None,
     ) -> dict[str, Any]:
         """
-        Evaluates the provided query against the documents and returns the evaluation result.
+        Evaluates the provided inputs against each metric and returns the results.
 
         :param query: The input query from the user.
         :param response: A list of ChatMessage responses (typically from a language model or agent).
@@ -120,7 +92,7 @@ def run(
         :param reference: A string reference answer for the query.
         :param rubrics: A dictionary of evaluation rubrics, where keys represent the score and the values
             represent the corresponding evaluation criteria.
-        :return: A dictionary containing the evaluation result.
+        :return: A dictionary with key ``result`` mapping metric names to their `MetricResult`.
         """
         processed_docs = self._process_documents(documents)
         processed_response = self._process_response(response)
@@ -135,30 +107,41 @@ def run(
             reference=reference,
             rubrics=rubrics,
         )
-
         except (ValueError, ValidationError) as e:
             self._handle_conversion_error(e)
 
-        dataset = EvaluationDataset([sample])
+        results: dict[str, MetricResult] = {}
+        for metric in self.metrics:
+            results[metric.name] = self._score_metric(metric, sample)
 
-        try:
-            result = evaluate(
-                dataset=dataset,
-                metrics=self.metrics,
-                llm=self.llm,
-                embeddings=self.embedding,
-            )
-        except (ValueError, ValidationError) as e:
-            self._handle_evaluation_error(e)
+        return {"result": results}
+
+    def _score_metric(self, metric: SimpleBaseMetric, sample: SingleTurnSample) -> MetricResult:
+        """
+        Score a metric by inspecting its ascore() signature and passing only matching sample fields.
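+
+        For example, a metric whose ``ascore`` declares ``(user_input, response, retrieved_contexts)``
+        receives all three fields from the sample, while one that declares only
+        ``(user_input, response)`` never sees ``retrieved_contexts``; populated sample
+        fields that a metric does not accept are simply not forwarded.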
-        return {"result": result}
+        :param metric: A SimpleBaseMetric instance to score.
+        :param sample: The SingleTurnSample holding all available input fields.
+        :return: MetricResult from the metric.
+        """
+        sig = inspect.signature(metric.ascore)
+        excluded = {"self", "callbacks"}
+        valid_params = {
+            name
+            for name, param in sig.parameters.items()
+            if name not in excluded
+            and param.kind not in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)
+        }
+        sample_dict = sample.model_dump()
+        kwargs = {k: v for k, v in sample_dict.items() if k in valid_params and v is not None}
+        return metric.score(**kwargs)
 
     def _process_documents(self, documents: list[Document | str] | None) -> list[str] | None:
         """
         Process and validate input documents.
 
-        :param documents: List of Documents or strings to process
-        :return: List of document contents as strings or None
+        :param documents: List of Documents or strings to process.
+        :return: List of document contents as strings or None.
         """
         if documents is None:
             return None
@@ -178,10 +161,10 @@ def _process_response(self, response: list[ChatMessage] | str | N
         """
         Process response into expected format.
 
-        :param response: Response to process
-        :return: None or Processed response string
+        :param response: Response to process.
+        :return: None or processed response string.
         """
-        if isinstance(response, list):  # Check if response is a list
+        if isinstance(response, list):
             if all(isinstance(item, ChatMessage) and item.text for item in response):
                 return response[0].text
             return None
@@ -191,9 +174,9 @@ def _process_response(self, response: list[ChatMessage] | str | N
 
     def _handle_conversion_error(self, error: Exception) -> None:
         """
-        Handle evaluation errors with improved messages.
+        Re-raise pydantic validation errors from SingleTurnSample with Haystack-friendly field names.
 
-        :param error: Original error
+        :param error: Original error.
         """
         if isinstance(error, ValidationError):
             field_mapping = {
@@ -217,26 +200,6 @@ def _handle_conversion_error(self, error: Exception) -> None:
                 )
             raise ValueError(error_message)
 
-    def _handle_evaluation_error(self, error: Exception) -> None:
-        error_message = str(error)
-        columns_match = re.search(r"additional columns \[(.*?)\]", error_message)
-        field_mapping = {
-            "user_input": "query",
-            "retrieved_contexts": "documents",
-        }
-        if columns_match:
-            columns_str = columns_match.group(1)
-            columns = [col.strip().strip("'") for col in columns_str.split(",")]
-
-            mapped_columns = [field_mapping.get(col, col) for col in columns]
-            updated_columns_str = "[" + ", ".join(f"'{col}'" for col in mapped_columns) + "]"
-
-            # Update the list of columns in the error message
-            updated_error_message = error_message.replace(
-                columns_match.group(0), f"additional columns {updated_columns_str}"
-            )
-            raise ValueError(updated_error_message)
-
     def _get_expected_type_description(self, expected_type: Any) -> str:
         """Helper method to get a description of the expected type."""
         if get_origin(expected_type) is Union:
@@ -252,14 +215,13 @@ def _get_expected_type_description(self, expected_type: Any) -> str:
             value_type_name = getattr(value_type, "__name__", str(value_type))
             return f"a dictionary with keys of type {key_type_name} and values of type {value_type_name}"
         else:
-            # Handle non-generic types or unknown types gracefully
             return getattr(expected_type, "__name__", str(expected_type))
 
     def _get_example_input(self, field: str) -> str:
         """
         Helper method to get an example input based on the field.
-        :param field: Arguement used to make SingleTurnSample.
+        :param field: Argument used to make SingleTurnSample.
         :returns: Example usage for the field.
         """
         examples = {
diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py
index 1929453726..c801eec9b1 100644
--- a/integrations/ragas/tests/test_evaluator.py
+++ b/integrations/ragas/tests/test_evaluator.py
@@ -1,83 +1,149 @@
 import pytest
-from unittest import mock
 from unittest.mock import MagicMock
-from ragas.metrics import Metric, Faithfulness
-from ragas.llms import BaseRagasLLM
-from ragas.embeddings import BaseRagasEmbeddings
-from ragas.dataset_schema import EvaluationResult
+from ragas.metrics.base import SimpleBaseMetric
+from ragas.metrics.result import MetricResult
 from haystack import Document
 from haystack_integrations.components.evaluators.ragas import RagasEvaluator
 
 
-# Fixtures
-@pytest.fixture
-def mock_run():
-    """Fixture to mock the 'run' method of RagasEvaluator."""
-    with mock.patch.object(RagasEvaluator, 'run') as mock_method:
-        yield mock_method
-
-
-@pytest.fixture
-def ragas_evaluator():
-    """Fixture to create a valid RagasEvaluator instance."""
-    valid_metrics = [MagicMock(spec=Metric) for _ in range(3)]
-    valid_llm = MagicMock(spec=BaseRagasLLM)
-    valid_embedding = MagicMock(spec=BaseRagasEmbeddings)
-    return RagasEvaluator(
-        ragas_metrics=valid_metrics,
-        evaluator_llm=valid_llm,
-        evaluator_embedding=valid_embedding,
+def _make_metric(name: str, score: float = 0.8, reason: str = "test reason") -> MagicMock:
+    """Create a mock SimpleBaseMetric with a concrete ascore signature for inspect.signature."""
+    metric = MagicMock(spec=SimpleBaseMetric)
+    metric.name = name
+    metric.score.return_value = MetricResult(value=score, reason=reason)
+
+    async def ascore(user_input: str, response: str, retrieved_contexts: list) -> MetricResult:
+        return MetricResult(value=score, reason=reason)
+
+    metric.ascore = ascore
+    return metric
+
+
+# --- Initialization ---
+
+def test_successful_initialization():
+    metric = _make_metric("faithfulness")
+    evaluator = RagasEvaluator(ragas_metrics=[metric])
+    assert evaluator.metrics == [metric]
+
+
+def test_initialization_with_multiple_metrics():
+    metrics = [_make_metric("faithfulness"), _make_metric("answer_relevancy")]
+    evaluator = RagasEvaluator(ragas_metrics=metrics)
+    assert len(evaluator.metrics) == 2
+
+
+def test_invalid_metrics_raises_type_error():
+    with pytest.raises(TypeError, match="All items in ragas_metrics must be instances of SimpleBaseMetric."):
+        RagasEvaluator(ragas_metrics=["not_a_metric"])
+
+
+def test_invalid_metrics_mixed_raises_type_error():
+    """Even one non-SimpleBaseMetric in the list should fail."""
+    valid = _make_metric("faithfulness")
+    with pytest.raises(TypeError):
+        RagasEvaluator(ragas_metrics=[valid, "not_a_metric"])
+
+
+# --- run() — result structure ---
+
+def test_run_returns_metric_results_keyed_by_name():
+    metric = _make_metric("faithfulness", score=0.9)
+    evaluator = RagasEvaluator(ragas_metrics=[metric])
+
+    output = evaluator.run(
+        query="Which is the most popular global sport?",
+        response="Football is the most popular sport.",
+        documents=["Football is undoubtedly the world's most popular sport."],
     )
 
+    assert "result" in output
+    assert "faithfulness" in output["result"]
+    result = output["result"]["faithfulness"]
+    assert isinstance(result, MetricResult)
+    assert result.value == 0.9
+
+
+def test_run_scores_all_metrics():
+    metrics = [_make_metric("faithfulness", 0.9), _make_metric("answer_relevancy", 0.7)]
+    evaluator = RagasEvaluator(ragas_metrics=metrics)
+
+    output = evaluator.run(query="test?", response="answer", documents=["doc"])
+
+    assert set(output["result"].keys()) == {"faithfulness", "answer_relevancy"}
+    assert output["result"]["faithfulness"].value == 0.9
+    assert output["result"]["answer_relevancy"].value == 0.7
+
+
+def test_run_calls_score_on_each_metric():
+    metric_a = _make_metric("faithfulness")
+    metric_b = _make_metric("answer_relevancy")
+    evaluator = RagasEvaluator(ragas_metrics=[metric_a, metric_b])
+
+    evaluator.run(query="test?", response="answer", documents=["doc"])
+
+    metric_a.score.assert_called_once()
+    metric_b.score.assert_called_once()
 
-# Tests
-def test_successful_initialization(ragas_evaluator):
-    """Test RagasEvaluator initializes correctly with valid inputs."""
-    assert len(ragas_evaluator.metrics) == 3
-    assert isinstance(ragas_evaluator.llm, BaseRagasLLM)
-    assert isinstance(ragas_evaluator.embedding, BaseRagasEmbeddings)
 
+# --- run() — parameter filtering ---
 
-def test_invalid_metrics():
-    """Test RagasEvaluator raises TypeError for invalid metrics."""
-    invalid_metric = "not_a_metric"
+def test_score_metric_passes_only_matching_params():
+    """Metric that only needs user_input + response should not receive retrieved_contexts."""
+    metric = MagicMock(spec=SimpleBaseMetric)
+    metric.name = "selective_metric"
+    metric.score.return_value = MetricResult(value=0.5, reason="ok")
 
-    with pytest.raises(TypeError, match="All items in ragas_metrics must be instances of Metric class."):
-        RagasEvaluator(ragas_metrics=[invalid_metric])
+    async def ascore(user_input: str, response: str) -> MetricResult:
+        return MetricResult(value=0.5, reason="ok")
 
+    metric.ascore = ascore
 
-def test_invalid_llm():
-    """Test RagasEvaluator raises TypeError for invalid evaluator_llm."""
-    valid_metric = MagicMock(spec=Metric)
-    invalid_llm = "not_a_llm"
+    evaluator = RagasEvaluator(ragas_metrics=[metric])
+    evaluator.run(query="test?", response="answer", documents=["doc"], reference="ref")
 
-    with pytest.raises(TypeError, match="Expected evaluator_llm to be BaseRagasLLM"):
-        RagasEvaluator(ragas_metrics=[valid_metric], evaluator_llm=invalid_llm)
+    metric.score.assert_called_once_with(user_input="test?", response="answer")
 
 
-def test_invalid_embedding():
-    """Test RagasEvaluator raises TypeError for invalid evaluator_embedding."""
-    valid_metric = MagicMock(spec=Metric)
-    invalid_embedding = "not_an_embedding"
+def test_score_metric_omits_none_fields():
+    """None fields are not forwarded even if they match the signature."""
+    metric = _make_metric("faithfulness")
+    evaluator = RagasEvaluator(ragas_metrics=[metric])
 
-    with pytest.raises(TypeError, match="Expected evaluator_embedding to be BaseRagasEmbeddings"):
-        RagasEvaluator(ragas_metrics=[valid_metric], evaluator_embedding=invalid_embedding)
+    evaluator.run(query="test?", response="answer")  # no documents → retrieved_contexts=None
 
+    _, kwargs = metric.score.call_args
+    assert "retrieved_contexts" not in kwargs
 
-def test_initializer_allows_optional_llm_and_embeddings():
-    """Test RagasEvaluator initializes correctly with None for optional parameters."""
-    valid_metric = MagicMock(spec=Metric)
-    evaluator = RagasEvaluator(
-        ragas_metrics=[valid_metric],
-        evaluator_llm=None,
-        evaluator_embedding=None,
+
+# --- run() — input processing ---
+
+def test_run_accepts_document_objects():
+    metric = _make_metric("faithfulness")
+    evaluator = RagasEvaluator(ragas_metrics=[metric])
+
+    evaluator.run(
+        query="test?",
+        response="answer",
+        documents=[Document(content="some content"), Document(content="more content")],
     )
-    assert evaluator.metrics == [valid_metric]
-    assert evaluator.llm is None
-    assert evaluator.embedding is None
+
+    _, kwargs = metric.score.call_args
+    assert kwargs["retrieved_contexts"] == ["some content", "more content"]
+
+
+def test_run_accepts_string_documents():
+    metric = _make_metric("faithfulness")
+    evaluator = RagasEvaluator(ragas_metrics=[metric])
+
+    evaluator.run(query="test?", response="answer", documents=["doc one", "doc two"])
+
+    _, kwargs = metric.score.call_args
+    assert kwargs["retrieved_contexts"] == ["doc one", "doc two"]
 
+
+# --- run() — input validation errors ---
+
 @pytest.mark.parametrize(
     "invalid_input,field_name,error_message",
     [
@@ -86,9 +152,8 @@ def test_initializer_allows_optional_llm_and_embeddings():
         (["score_1"], "rubrics", "'rubrics' field expected"),
     ],
 )
-def test_run_invalid_inputs(invalid_input, field_name, error_message):
-    """Test RagasEvaluator raises ValueError for invalid input types."""
-    evaluator = RagasEvaluator(ragas_metrics=[Faithfulness()])
+def test_run_raises_on_invalid_input_types(invalid_input, field_name, error_message):
+    evaluator = RagasEvaluator(ragas_metrics=[_make_metric("faithfulness")])
     query = "Which is the most popular global sport?"
     documents = ["Football is the most popular sport."]
     response = "Football is the most popular sport in the world"
@@ -102,49 +167,3 @@ def test_run_invalid_inputs(invalid_input, field_name, error_message):
             evaluator.run(query=query, rubrics=invalid_input, documents=documents, response=response)
 
     assert error_message in str(exc_info.value)
-
-
-def test_missing_columns_in_dataset():
-    """Test if RagasEvaluator raises a ValueError when required columns are missing for a specific metric."""
-    evaluator = RagasEvaluator(ragas_metrics=[Faithfulness()])
-    query = "Which is the most popular global sport?"
-    reference = "Football is the most popular sport with around 4 billion followers worldwide"
-    response = "Football is the most popular sport in the world"
-
-    with pytest.raises(ValueError) as exc_info:
-        evaluator.run(query=query, reference=reference, response=response)
-
-    assert "faithfulness" in str(exc_info.value)
-    assert "documents" in str(exc_info.value)
-
-
-def test_run_valid_input(mock_run):
-    """Test RagasEvaluator runs successfully with valid input."""
-    mock_run.return_value = {"result": {"score": MagicMock(), "details": MagicMock(spec=EvaluationResult)}}
-    evaluator = RagasEvaluator(ragas_metrics=[MagicMock(Metric)])
-
-    query = "Which is the most popular global sport?"
-    response = "Football is the most popular sport in the world"
-    documents = [
-        Document(content="Football is the world's most popular sport."),
-        Document(content="Football has over 4 billion followers."),
-    ]
-    reference_contexts = ["Football is a globally popular sport."]
-    multi_responses = ["Football is considered the most popular sport."]
-    reference = "Football is the most popular sport with around 4 billion followers worldwide"
-    rubrics = {"accuracy": "high", "relevance": "high"}
-
-    output = evaluator.run(
-        query=query,
-        response=response,
-        documents=documents,
-        reference_contexts=reference_contexts,
-        multi_responses=multi_responses,
-        reference=reference,
-        rubrics=rubrics,
-    )
-
-    assert "result" in output
-    assert isinstance(output["result"], dict)
-    assert "score" in output["result"]
-    assert isinstance(output["result"]["details"], EvaluationResult)

From 475707365672132770499e8a7f3090635a14833e Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Wed, 22 Apr 2026 13:04:26 +0200
Subject: [PATCH 02/20] update import path

---
 .../components/evaluators/ragas/evaluator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
index c3a0a9c4eb..2938670233 100644
--- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
+++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
@@ -23,7 +23,7 @@ class RagasEvaluator:
     Usage example:
     ```python
     from openai import AsyncOpenAI
-    from ragas.llms.base import llm_factory
+    from ragas.llms import llm_factory
     from ragas.metrics.collections import Faithfulness
 
     from haystack_integrations.components.evaluators.ragas import RagasEvaluator

From a7b74d48eff142e72ec20f74073c772bb8e6a268 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Wed, 22 Apr 2026 13:07:21 +0200
Subject: [PATCH 03/20] test refactoring

---
 integrations/ragas/tests/test_evaluator.py | 218 ++++++++++-----------
 1 file changed, 102 insertions(+), 116 deletions(-)

diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py
index c801eec9b1..ec7ca97ab9 100644
--- a/integrations/ragas/tests/test_evaluator.py
+++ b/integrations/ragas/tests/test_evaluator.py
@@ -19,151 +19,137 @@ async def ascore(user_input: str, response: str, retrieved_contexts: list) -> Me
     return metric
 
 
-# --- Initialization ---
+class TestInitialization:
+    def test_successful_initialization(self):
+        metric = _make_metric("faithfulness")
+        evaluator = RagasEvaluator(ragas_metrics=[metric])
+        assert evaluator.metrics == [metric]
 
-def test_successful_initialization():
-    metric = _make_metric("faithfulness")
-    evaluator = RagasEvaluator(ragas_metrics=[metric])
-    assert evaluator.metrics == [metric]
+    def test_initialization_with_multiple_metrics(self):
+        metrics = [_make_metric("faithfulness"), _make_metric("answer_relevancy")]
+        evaluator = RagasEvaluator(ragas_metrics=metrics)
+        assert len(evaluator.metrics) == 2
 
+    def test_invalid_metrics_raises_type_error(self):
+        with pytest.raises(TypeError, match="All items in ragas_metrics must be instances of SimpleBaseMetric."):
+            RagasEvaluator(ragas_metrics=["not_a_metric"])
 
-def test_initialization_with_multiple_metrics():
-    metrics = [_make_metric("faithfulness"), _make_metric("answer_relevancy")]
-    evaluator = RagasEvaluator(ragas_metrics=metrics)
-    assert len(evaluator.metrics) == 2
+    def test_invalid_metrics_mixed_raises_type_error(self):
+        valid = _make_metric("faithfulness")
+        with pytest.raises(TypeError):
+            RagasEvaluator(ragas_metrics=[valid, "not_a_metric"])
 
 
-def test_invalid_metrics_raises_type_error():
-    with pytest.raises(TypeError, match="All items in ragas_metrics must be instances of SimpleBaseMetric."):
-        RagasEvaluator(ragas_metrics=["not_a_metric"])
+class TestRunResultStructure:
+    def test_run_returns_metric_results_keyed_by_name(self):
+        metric = _make_metric("faithfulness", score=0.9)
+        evaluator = RagasEvaluator(ragas_metrics=[metric])
 
+        output = evaluator.run(
+            query="Which is the most popular global sport?",
+            response="Football is the most popular sport.",
+            documents=["Football is undoubtedly the world's most popular sport."],
+        )
 
-def test_invalid_metrics_mixed_raises_type_error():
-    """Even one non-SimpleBaseMetric in the list should fail."""
-    valid = _make_metric("faithfulness")
-    with pytest.raises(TypeError):
-        RagasEvaluator(ragas_metrics=[valid, "not_a_metric"])
+        assert "result" in output
+        assert "faithfulness" in output["result"]
+        result = output["result"]["faithfulness"]
+        assert isinstance(result, MetricResult)
+        assert result.value == 0.9
 
+    def test_run_scores_all_metrics(self):
+        metrics = [_make_metric("faithfulness", 0.9), _make_metric("answer_relevancy", 0.7)]
+        evaluator = RagasEvaluator(ragas_metrics=metrics)
 
-# --- run() — result structure ---
+        output = evaluator.run(query="test?", response="answer", documents=["doc"])
 
-def test_run_returns_metric_results_keyed_by_name():
-    metric = _make_metric("faithfulness", score=0.9)
-    evaluator = RagasEvaluator(ragas_metrics=[metric])
+        assert set(output["result"].keys()) == {"faithfulness", "answer_relevancy"}
+        assert output["result"]["faithfulness"].value == 0.9
+        assert output["result"]["answer_relevancy"].value == 0.7
 
-    output = evaluator.run(
-        query="Which is the most popular global sport?",
-        response="Football is the most popular sport.",
-        documents=["Football is undoubtedly the world's most popular sport."],
-    )
-
-    assert "result" in output
-    assert "faithfulness" in output["result"]
-    result = output["result"]["faithfulness"]
-    assert isinstance(result, MetricResult)
-    assert result.value == 0.9
+    def test_run_calls_score_on_each_metric(self):
+        metric_a = _make_metric("faithfulness")
+        metric_b = _make_metric("answer_relevancy")
+        evaluator = RagasEvaluator(ragas_metrics=[metric_a, metric_b])
 
+        evaluator.run(query="test?", response="answer", documents=["doc"])
 
-def test_run_scores_all_metrics():
-    metrics = [_make_metric("faithfulness", 0.9), _make_metric("answer_relevancy", 0.7)]
-    evaluator = RagasEvaluator(ragas_metrics=metrics)
+        metric_a.score.assert_called_once()
+        metric_b.score.assert_called_once()
 
-    output = evaluator.run(query="test?", response="answer", documents=["doc"])
 
-    assert set(output["result"].keys()) == {"faithfulness", "answer_relevancy"}
-    assert output["result"]["faithfulness"].value == 0.9
-    assert output["result"]["answer_relevancy"].value == 0.7
+class TestRunParameterFiltering:
+    def test_score_metric_passes_only_matching_params(self):
+        """Metric that only needs user_input + response should not receive retrieved_contexts."""
+        metric = MagicMock(spec=SimpleBaseMetric)
+        metric.name = "selective_metric"
+        metric.score.return_value = MetricResult(value=0.5, reason="ok")
 
+        async def ascore(user_input: str, response: str) -> MetricResult:
+            return MetricResult(value=0.5, reason="ok")
 
-def test_run_calls_score_on_each_metric():
-    metric_a = _make_metric("faithfulness")
-    metric_b = _make_metric("answer_relevancy")
-    evaluator = RagasEvaluator(ragas_metrics=[metric_a, metric_b])
+        metric.ascore = ascore
 
-    evaluator.run(query="test?", response="answer", documents=["doc"])
+        evaluator = RagasEvaluator(ragas_metrics=[metric])
+        evaluator.run(query="test?", response="answer", documents=["doc"], reference="ref")
 
-    metric_a.score.assert_called_once()
-    metric_b.score.assert_called_once()
+        metric.score.assert_called_once_with(user_input="test?", response="answer")
 
+    def test_score_metric_omits_none_fields(self):
+        metric = _make_metric("faithfulness")
+        evaluator = RagasEvaluator(ragas_metrics=[metric])
 
-# --- run() — parameter filtering ---
+        evaluator.run(query="test?", response="answer")  # no documents → retrieved_contexts=None
 
-def test_score_metric_passes_only_matching_params():
-    """Metric that only needs user_input + response should not receive retrieved_contexts."""
-    metric = MagicMock(spec=SimpleBaseMetric)
-    metric.name = "selective_metric"
-    metric.score.return_value = MetricResult(value=0.5, reason="ok")
+        _, kwargs = metric.score.call_args
+        assert "retrieved_contexts" not in kwargs
 
-    async def ascore(user_input: str, response: str) -> MetricResult:
-        return MetricResult(value=0.5, reason="ok")
-
-    metric.ascore = ascore
 
-    evaluator = RagasEvaluator(ragas_metrics=[metric])
-    evaluator.run(query="test?", response="answer", documents=["doc"], reference="ref")
+class TestRunInputProcessing:
+    def test_run_accepts_document_objects(self):
+        metric = _make_metric("faithfulness")
+        evaluator = RagasEvaluator(ragas_metrics=[metric])
 
-    metric.score.assert_called_once_with(user_input="test?", response="answer")
+        evaluator.run(
+            query="test?",
+            response="answer",
+            documents=[Document(content="some content"), Document(content="more content")],
+        )
 
+        _, kwargs = metric.score.call_args
+        assert kwargs["retrieved_contexts"] == ["some content", "more content"]
 
-def test_score_metric_omits_none_fields():
-    """None fields are not forwarded even if they match the signature."""
-    metric = _make_metric("faithfulness")
-    evaluator = RagasEvaluator(ragas_metrics=[metric])
+    def test_run_accepts_string_documents(self):
+        metric = _make_metric("faithfulness")
+        evaluator = RagasEvaluator(ragas_metrics=[metric])
 
-    evaluator.run(query="test?", response="answer")  # no documents → retrieved_contexts=None
+        evaluator.run(query="test?", response="answer", documents=["doc one", "doc two"])
 
-    _, kwargs = metric.score.call_args
-    assert "retrieved_contexts" not in kwargs
+        _, kwargs = metric.score.call_args
+        assert kwargs["retrieved_contexts"] == ["doc one", "doc two"]
 
 
-# --- run() — input processing ---
-
-def test_run_accepts_document_objects():
-    metric = _make_metric("faithfulness")
-    evaluator = RagasEvaluator(ragas_metrics=[metric])
-
-    evaluator.run(
-        query="test?",
-        response="answer",
-        documents=[Document(content="some content"), Document(content="more content")],
+class TestRunInputValidation:
+    @pytest.mark.parametrize(
+        "invalid_input,field_name,error_message",
+        [
+            (["Invalid query type"], "query", "'query' field expected"),
+            ([123, ["Invalid document"]], "documents", "'documents' must be a list"),
+            (["score_1"], "rubrics", "'rubrics' field expected"),
+        ],
     )
-
-    _, kwargs = metric.score.call_args
-    assert kwargs["retrieved_contexts"] == ["some content", "more content"]
-
-
-def test_run_accepts_string_documents():
-    metric = _make_metric("faithfulness")
-    evaluator = RagasEvaluator(ragas_metrics=[metric])
-
-    evaluator.run(query="test?", response="answer", documents=["doc one", "doc two"])
-
-    _, kwargs = metric.score.call_args
-    assert kwargs["retrieved_contexts"] == ["doc one", "doc two"]
-
-
-# --- run() — input validation errors ---
-
-@pytest.mark.parametrize(
-    "invalid_input,field_name,error_message",
-    [
-        (["Invalid query type"], "query", "'query' field expected"),
-        ([123, ["Invalid document"]], "documents", "'documents' must be a list"),
-        (["score_1"], "rubrics", "'rubrics' field expected"),
-    ],
-)
-def test_run_raises_on_invalid_input_types(invalid_input, field_name, error_message):
-    evaluator = RagasEvaluator(ragas_metrics=[_make_metric("faithfulness")])
-    query = "Which is the most popular global sport?"
-    documents = ["Football is the most popular sport."]
-    response = "Football is the most popular sport in the world"
-
-    with pytest.raises(ValueError) as exc_info:
-        if field_name == "query":
-            evaluator.run(query=invalid_input, documents=documents, response=response)
-        elif field_name == "documents":
-            evaluator.run(query=query, documents=invalid_input, response=response)
-        elif field_name == "rubrics":
-            evaluator.run(query=query, rubrics=invalid_input, documents=documents, response=response)
-
-    assert error_message in str(exc_info.value)
+    def test_run_raises_on_invalid_input_types(self, invalid_input, field_name, error_message):
+        evaluator = RagasEvaluator(ragas_metrics=[_make_metric("faithfulness")])
+        query = "Which is the most popular global sport?"
+        documents = ["Football is the most popular sport."]
+        response = "Football is the most popular sport in the world"
+
+        with pytest.raises(ValueError) as exc_info:
+            if field_name == "query":
+                evaluator.run(query=invalid_input, documents=documents, response=response)
+            elif field_name == "documents":
+                evaluator.run(query=query, documents=invalid_input, response=response)
+            elif field_name == "rubrics":
+                evaluator.run(query=query, rubrics=invalid_input, documents=documents, response=response)
+
+        assert error_message in str(exc_info.value)

From 50bad6aad7b234a5c46dc52a932b55e989290842 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Wed, 22 Apr 2026 13:17:04 +0200
Subject: [PATCH 04/20] add integration test

---
 integrations/ragas/pyproject.toml          |   7 +
 integrations/ragas/tests/test_evaluator.py | 153 +++++++++++++++
 2 files changed, 160 insertions(+)

diff --git a/integrations/ragas/pyproject.toml b/integrations/ragas/pyproject.toml
index a103c36337..0d81a347f4 100644
--- a/integrations/ragas/pyproject.toml
+++ b/integrations/ragas/pyproject.toml
@@ -164,3 +164,10 @@ parallel = false
 omit = ["*/tests/*", "*/__init__.py"]
 show_missing = true
 exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
+
+[tool.pytest.ini_options]
+addopts = "--strict-markers"
+markers = [
+  "integration: integration tests",
+]
+log_cli = true
diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py
index ec7ca97ab9..7db45d5af0 100644
--- a/integrations/ragas/tests/test_evaluator.py
+++ b/integrations/ragas/tests/test_evaluator.py
@@ -1,3 +1,5 @@
+import os
+
 import pytest
 from unittest.mock import MagicMock
 from ragas.metrics.base import SimpleBaseMetric
@@ -153,3 +155,154 @@ def test_run_raises_on_invalid_input_types(self, invalid_input, field_name, erro
             evaluator.run(query=query, rubrics=invalid_input, documents=documents, response=response)
 
         assert error_message in str(exc_info.value)
+
+
+@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="Set OPENAI_API_KEY to run integration tests.")
+@pytest.mark.integration
+class TestStandaloneEvaluationIntegration:
+    def _make_llm(self):
+        from openai import AsyncOpenAI
+        from ragas.llms import llm_factory
+
+        return llm_factory("gpt-4o-mini", client=AsyncOpenAI())
+
+    def test_faithfulness_returns_valid_score(self):
+        from ragas.metrics.collections import Faithfulness
+
+        evaluator = RagasEvaluator(ragas_metrics=[Faithfulness(llm=self._make_llm())])
+
+        output = evaluator.run(
+            query="What makes Meta AI's LLaMA models stand out?",
+            response="Meta AI's LLaMA models stand out for being open-source.",
+            documents=[
+                "Meta AI is best known for its LLaMA series, which has been made open-source "
+                "for researchers and developers. LLaMA models are praised for their ability to "
+                "support innovation and experimentation due to their accessibility."
+            ],
+        )
+
+        result = output["result"]["faithfulness"]
+        assert isinstance(result, MetricResult)
+        assert 0.0 <= result.value <= 1.0
+
+    def test_answer_relevancy_uses_only_query_and_response(self):
+        """AnswerRelevancy only declares user_input + response in ascore — documents should not be forwarded."""
+        from ragas.metrics.collections import AnswerRelevancy
+
+        evaluator = RagasEvaluator(ragas_metrics=[AnswerRelevancy(llm=self._make_llm())])
+
+        output = evaluator.run(
+            query="What makes Meta AI's LLaMA models stand out?",
+            response="They are open-source and freely available to researchers.",
+            documents=["Meta AI released LLaMA as an open-source model."],
+        )
+
+        result = output["result"]["answer_relevancy"]
+        assert isinstance(result, MetricResult)
+        assert 0.0 <= result.value <= 1.0
+
+    def test_multiple_metrics_all_return_results(self):
+        from ragas.metrics.collections import AnswerRelevancy, ContextPrecision, Faithfulness
+
+        llm = self._make_llm()
+        evaluator = RagasEvaluator(
+            ragas_metrics=[
+                Faithfulness(llm=llm),
+                AnswerRelevancy(llm=llm),
+                ContextPrecision(llm=llm),
+            ]
+        )
+
+        output = evaluator.run(
+            query="What makes Meta AI's LLaMA models stand out?",
+            response=(
+                "Meta AI's LLaMA models stand out for being open-source, supporting "
+                "innovation and experimentation due to their accessibility and strong performance."
+            ),
+            documents=[
+                "Meta AI is best known for its LLaMA series, which has been made open-source.",
+                "Meta AI with its LLaMA models aims to democratize AI development by making "
+                "high-quality models available for free, fostering collaboration across industries.",
+            ],
+            reference=(
+                "Meta AI's LLaMA models stand out for being open-source, supporting innovation "
+                "and experimentation due to their accessibility and strong performance."
+            ),
+        )
+
+        assert set(output["result"].keys()) == {"faithfulness", "answer_relevancy", "context_precision"}
+        for metric_result in output["result"].values():
+            assert isinstance(metric_result, MetricResult)
+            assert 0.0 <= metric_result.value <= 1.0
+
+
+@_OPENAI_SKIP
+@pytest.mark.integration
+class TestPipelineIntegration:
+    def test_ragas_evaluator_in_rag_pipeline(self):
+        from openai import AsyncOpenAI
+        from ragas.llms import llm_factory
+        from ragas.metrics.collections import Faithfulness
+        from haystack import Pipeline
+        from haystack.components.builders import AnswerBuilder, ChatPromptBuilder
+        from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
+        from haystack.components.generators.chat import OpenAIChatGenerator
+        from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
+        from haystack.dataclasses import ChatMessage
+        from haystack.document_stores.in_memory import InMemoryDocumentStore
+
+        dataset = [
+            "Meta AI is best known for its LLaMA series, which has been made open-source "
+            "for researchers and developers.",
+            "LLaMA models are praised for their ability to support innovation and "
+            "experimentation due to their accessibility and strong performance.",
+            "Meta AI with its LLaMA models aims to democratize AI development by making "
+            "high-quality models available for free.",
+        ]
+
+        document_store = InMemoryDocumentStore()
+        docs = [Document(content=text) for text in dataset]
+        document_embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
+        document_store.write_documents(document_embedder.run(docs)["documents"])
+
+        llm = llm_factory("gpt-4o-mini", client=AsyncOpenAI())
+        ragas_evaluator = RagasEvaluator(ragas_metrics=[Faithfulness(llm=llm)])
+
+        template = [
+            ChatMessage.from_user(
+                "Answer the question based on the context.\n\n"
+                "Context:\n{% for document in documents %}{{ document.content }}\n{% endfor %}\n\n"
+                "Question: {{question}}\nAnswer:"
+            )
+        ]
+
+        pipeline = Pipeline()
+        pipeline.add_component("text_embedder", OpenAITextEmbedder(model="text-embedding-3-small"))
+        pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=2))
+        pipeline.add_component("prompt_builder", ChatPromptBuilder(template=template, required_variables="*"))
+        pipeline.add_component("llm", OpenAIChatGenerator(model="gpt-4o-mini"))
+        pipeline.add_component("answer_builder", AnswerBuilder())
+        pipeline.add_component("ragas_evaluator", ragas_evaluator)
+
+        pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
+        pipeline.connect("retriever", "prompt_builder")
+        pipeline.connect("prompt_builder.prompt", "llm.messages")
+        pipeline.connect("llm.replies", "answer_builder.replies")
+        pipeline.connect("retriever", "answer_builder.documents")
+        pipeline.connect("retriever", "ragas_evaluator.documents")
+        pipeline.connect("llm.replies", "ragas_evaluator.response")
+
+        question = "What makes Meta AI's LLaMA models stand out?"
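+        # The retrieved documents and the generated reply reach the evaluator via the
+        # pipeline connections above, so only the query is supplied at run time.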
+        result = pipeline.run(
+            {
+                "text_embedder": {"text": question},
+                "prompt_builder": {"question": question},
+                "answer_builder": {"query": question},
+                "ragas_evaluator": {"query": question},
+            }
+        )
+
+        assert "ragas_evaluator" in result
+        faithfulness_result = result["ragas_evaluator"]["result"]["faithfulness"]
+        assert isinstance(faithfulness_result, MetricResult)
+        assert 0.0 <= faithfulness_result.value <= 1.0

From 3c95220ce590895fc6cc2eec139760abf7b87b7b Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Wed, 22 Apr 2026 13:18:11 +0200
Subject: [PATCH 05/20] fix pytest marker

---
 integrations/ragas/tests/test_evaluator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py
index 7db45d5af0..4e23b815b4 100644
--- a/integrations/ragas/tests/test_evaluator.py
+++ b/integrations/ragas/tests/test_evaluator.py
@@ -236,7 +236,7 @@ def test_multiple_metrics_all_return_results(self):
             assert 0.0 <= metric_result.value <= 1.0
 
 
-@_OPENAI_SKIP
+@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="Set OPENAI_API_KEY to run integration tests.")
 @pytest.mark.integration
 class TestPipelineIntegration:
     def test_ragas_evaluator_in_rag_pipeline(self):

From 22e3fcca623931655a348479c1e00fb599773d5d Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Wed, 22 Apr 2026 13:24:19 +0200
Subject: [PATCH 06/20] fix integration tests

---
 integrations/ragas/tests/test_evaluator.py | 47 ++++++++++------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py
index 4e23b815b4..0adfbcfe6a 100644
--- a/integrations/ragas/tests/test_evaluator.py
+++ b/integrations/ragas/tests/test_evaluator.py
@@ -1,10 +1,20 @@
 import os
 
 import pytest
+from openai import AsyncOpenAI
 from unittest.mock import MagicMock
+from ragas.embeddings.base import embedding_factory
+from ragas.llms import llm_factory
 from ragas.metrics.base import SimpleBaseMetric
+from ragas.metrics.collections import AnswerRelevancy, ContextPrecision, Faithfulness
 from ragas.metrics.result import MetricResult
-from haystack import Document
+from haystack import Document, Pipeline
+from haystack.components.builders import AnswerBuilder, ChatPromptBuilder
+from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
+from haystack.components.generators.chat import OpenAIChatGenerator
+from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
+from haystack.dataclasses import ChatMessage
+from haystack.document_stores.in_memory import InMemoryDocumentStore
 from haystack_integrations.components.evaluators.ragas import RagasEvaluator
 
 
@@ -161,14 +171,12 @@ def test_run_raises_on_invalid_input_types(self, invalid_input, field_name, erro
 @pytest.mark.integration
 class TestStandaloneEvaluationIntegration:
     def _make_llm(self):
-        from openai import AsyncOpenAI
-        from ragas.llms import llm_factory
-
         return llm_factory("gpt-4o-mini", client=AsyncOpenAI())
 
-    def test_faithfulness_returns_valid_score(self):
-        from ragas.metrics.collections import Faithfulness
+    def _make_embeddings(self):
+        return embedding_factory("openai", model="text-embedding-3-small", client=AsyncOpenAI())
 
+    def test_faithfulness_returns_valid_score(self):
         evaluator = RagasEvaluator(ragas_metrics=[Faithfulness(llm=self._make_llm())])
 
         output = evaluator.run(
@@ -187,9 +195,9 @@ def test_faithfulness_returns_valid_score(self):
 
     def test_answer_relevancy_uses_only_query_and_response(self):
         """AnswerRelevancy only declares user_input + response in ascore — documents should not be forwarded."""
-        from ragas.metrics.collections import AnswerRelevancy
-
-        evaluator = RagasEvaluator(ragas_metrics=[AnswerRelevancy(llm=self._make_llm())])
+        evaluator = RagasEvaluator(
+            ragas_metrics=[AnswerRelevancy(llm=self._make_llm(), embeddings=self._make_embeddings())]
+        )
 
         output = evaluator.run(
             query="What makes Meta AI's LLaMA models stand out?",
@@ -202,13 +210,12 @@ def test_answer_relevancy_uses_only_query_and_response(self):
         assert 0.0 <= result.value <= 1.0
 
     def test_multiple_metrics_all_return_results(self):
-        from ragas.metrics.collections import AnswerRelevancy, ContextPrecision, Faithfulness
-
         llm = self._make_llm()
+        embeddings = self._make_embeddings()
         evaluator = RagasEvaluator(
             ragas_metrics=[
                 Faithfulness(llm=llm),
-                AnswerRelevancy(llm=llm),
+                AnswerRelevancy(llm=llm, embeddings=embeddings),
                 ContextPrecision(llm=llm),
             ]
         )
@@ -240,17 +247,6 @@ def test_multiple_metrics_all_return_results(self):
 @pytest.mark.integration
 class TestPipelineIntegration:
     def test_ragas_evaluator_in_rag_pipeline(self):
-        from openai import AsyncOpenAI
-        from ragas.llms import llm_factory
-        from ragas.metrics.collections import Faithfulness
-        from haystack import Pipeline
-        from haystack.components.builders import AnswerBuilder, ChatPromptBuilder
-        from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
-        from haystack.components.generators.chat import OpenAIChatGenerator
-        from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
-        from haystack.dataclasses import ChatMessage
-        from haystack.document_stores.in_memory import InMemoryDocumentStore
-
         dataset = [
             "Meta AI is best known for its LLaMA series, which has been made open-source "
             "for researchers and developers.",
@@ -265,8 +261,9 @@ def test_ragas_evaluator_in_rag_pipeline(self):
         document_embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
         document_store.write_documents(document_embedder.run(docs)["documents"])
 
-        llm = llm_factory("gpt-4o-mini", client=AsyncOpenAI())
-        ragas_evaluator = RagasEvaluator(ragas_metrics=[Faithfulness(llm=llm)])
+        ragas_evaluator = RagasEvaluator(
+            ragas_metrics=[Faithfulness(llm=llm_factory("gpt-4o-mini", client=AsyncOpenAI()))]
+        )
 
         template = [
             ChatMessage.from_user(

From 0fe9011f47451034d6fa94420b18d9f2c6aaad81 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Wed, 22 Apr 2026 13:45:38 +0200
Subject: [PATCH 07/20] Add se/de methods

---
 .../components/evaluators/ragas/evaluator.py | 32 +++++++-
 .../components/evaluators/ragas/utils.py     | 79 +++++++++++++++++++
 2 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 integrations/ragas/src/haystack_integrations/components/evaluators/ragas/utils.py

diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
index 2938670233..341591eba1 100644
--- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
+++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
@@ -1,7 +1,7 @@
 import inspect
 from typing import Any, Union, cast, get_args, get_origin
 
-from haystack import Document, component
+from haystack import Document, component, default_from_dict, default_to_dict
 from haystack.dataclasses import ChatMessage
 from pydantic import ValidationError
@@ -9,6 +9,8 @@
 from ragas.metrics.base import SimpleBaseMetric
 from ragas.metrics.result import MetricResult
 
+from haystack_integrations.components.evaluators.ragas.utils import _deserialize_metric, _serialize_metric
+
 
 @component
 class RagasEvaluator:
@@ -70,6 +72,34 @@ def _validate_inputs(self, metrics: list[SimpleBaseMetric]) -> None:
             error_message = "All items in ragas_metrics must be instances of SimpleBaseMetric."
             raise TypeError(error_message)
 
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
+        return default_to_dict(self, ragas_metrics=[_serialize_metric(m) for m in self.metrics])
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "RagasEvaluator":
+        """
+        Deserialize this component from a dictionary.
+
+        Metrics are reconstructed from their stored class path and LLM/embedding
+        configuration. Only the `openai` provider is supported for automatic
+        deserialization; the API key is read from the `OPENAI_API_KEY` environment
+        variable at load time.
+
+        :param data:
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
+        """
+        metrics_data = data.get("init_parameters", {}).get("ragas_metrics", [])
+        data["init_parameters"]["ragas_metrics"] = [_deserialize_metric(m) for m in metrics_data]
+        return default_from_dict(cls, data)
+
     @component.output_types(result=dict)
     def run(
         self,
diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/utils.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/utils.py
new file mode 100644
index 0000000000..bea85b81cc
--- /dev/null
+++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/utils.py
@@ -0,0 +1,79 @@
+# SPDX-FileCopyrightText: 2026-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+import importlib
+from typing import Any
+
+from openai import AsyncOpenAI
+from ragas.embeddings.base import embedding_factory
+from ragas.llms import llm_factory
+from ragas.metrics.base import SimpleBaseMetric
+
+
+def _serialize_metric(metric: SimpleBaseMetric) -> dict[str, Any]:
+    """
+    Serialize a `SimpleBaseMetric` to a JSON-compatible dict.
+
+    Stores the class path, metric name, and — when present — the LLM and
+    embeddings configuration (provider and model name).
+
+    :param metric: The metric instance to serialize.
+    :returns: A dict suitable for storage in a pipeline YAML or `to_dict` output.
+    """
+    metric_cls = type(metric)
+    serialized: dict[str, Any] = {
+        "type": f"{metric_cls.__module__}.{metric_cls.__qualname__}",
+        "name": metric.name,
+    }
+    llm = getattr(metric, "llm", None)
+    if llm is not None:
+        serialized["llm"] = {"model": llm.model, "provider": llm.provider}
+    embeddings = getattr(metric, "embeddings", None)
+    if embeddings is not None:
+        serialized["embeddings"] = {"model": embeddings.model, "provider": embeddings.PROVIDER_NAME}
+    return serialized
+
+
+def _deserialize_metric(data: dict[str, Any]) -> SimpleBaseMetric:
+    """
+    Reconstruct a `SimpleBaseMetric` from a serialized dict.
+
+    Imports the metric class from the stored `type` path and rebuilds any LLM
+    or embeddings using the stored provider and model name. Only the `openai`
+    provider is supported for automatic reconstruction; the API key is read from
+    the `OPENAI_API_KEY` environment variable at deserialization time.
+
+    :param data: Dict produced by `_serialize_metric`.
+    :returns: A fully constructed `SimpleBaseMetric` instance.
+    :raises ValueError: If a non-`openai` provider is encountered.
+    """
+    type_path = data["type"]
+    module_path, class_name = type_path.rsplit(".", 1)
+    metric_cls = getattr(importlib.import_module(module_path), class_name)
+
+    kwargs: dict[str, Any] = {}
+
+    if "llm" in data:
+        llm_data = data["llm"]
+        if llm_data["provider"] != "openai":
+            msg = (
+                f"Automatic deserialization only supports the 'openai' provider; "
+                f"got '{llm_data['provider']}'."
+            )
+            raise ValueError(msg)
+        kwargs["llm"] = llm_factory(llm_data["model"], client=AsyncOpenAI())
+
+    if "embeddings" in data:
+        emb_data = data["embeddings"]
+        if emb_data["provider"] != "openai":
+            msg = (
+                f"Automatic deserialization only supports the 'openai' provider; "
+                f"got '{emb_data['provider']}'."
+            )
+            raise ValueError(msg)
+        kwargs["embeddings"] = embedding_factory("openai", model=emb_data["model"], client=AsyncOpenAI())
+
+    if "name" in data:
+        kwargs["name"] = data["name"]
+
+    return metric_cls(**kwargs)

From a728ca2a52aa989178142d64f4f92d132804ace4 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee
Date: Wed, 22 Apr 2026 14:00:45 +0200
Subject: [PATCH 08/20] Refactor and add tests for serialization

---
 .../components/evaluators/ragas/evaluator.py |   3 +-
 .../components/evaluators/ragas/utils.py     |  11 +-
 integrations/ragas/tests/test_evaluator.py   |  88 +++++++++++---
 integrations/ragas/tests/test_utils.py       | 112 ++++++++++++++++++
 4 files changed, 185 insertions(+), 29 deletions(-)
 create mode 100644 integrations/ragas/tests/test_utils.py

diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
index 341591eba1..bb3ea29c23 100644
--- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
+++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py
@@ -5,12 +5,11 @@
 from haystack.dataclasses import ChatMessage
 from pydantic import ValidationError
 
+from haystack_integrations.components.evaluators.ragas.utils import _deserialize_metric, _serialize_metric
 from ragas.dataset_schema import SingleTurnSample
 from ragas.metrics.base import SimpleBaseMetric
 from ragas.metrics.result import MetricResult
 
-from haystack_integrations.components.evaluators.ragas.utils import _deserialize_metric, _serialize_metric
-
 
 @component
 class RagasEvaluator:
diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/utils.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/utils.py
index bea85b81cc..2e25e2f1b3 100644
--- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/utils.py
+++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/utils.py
@@ -5,6 +5,7 @@
 from typing import Any
 
 from openai import AsyncOpenAI
+
 from ragas.embeddings.base import embedding_factory
 from ragas.llms import llm_factory
 from ragas.metrics.base import SimpleBaseMetric
@@ -56,20 +57,14 @@ def _deserialize_metric(data: dict[str, Any]) -> SimpleBaseMetric:
     if "llm" in data:
         llm_data = data["llm"]
         if llm_data["provider"] != "openai":
-            msg = (
-                f"Automatic deserialization only supports the 'openai' provider; "
-                f"got '{llm_data['provider']}'."
-            )
+            msg = f"Automatic deserialization only supports the 'openai' provider; got '{llm_data['provider']}'."
             raise ValueError(msg)
         kwargs["llm"] = llm_factory(llm_data["model"], client=AsyncOpenAI())
 
     if "embeddings" in data:
         emb_data = data["embeddings"]
         if emb_data["provider"] != "openai":
-            msg = (
-                f"Automatic deserialization only supports the 'openai' provider; "
-                f"got '{emb_data['provider']}'."
-            )
+            msg = f"Automatic deserialization only supports the 'openai' provider; got '{emb_data['provider']}'."
             raise ValueError(msg)
         kwargs["embeddings"] = embedding_factory("openai", model=emb_data["model"], client=AsyncOpenAI())
 
diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py
index 0adfbcfe6a..2568639643 100644
--- a/integrations/ragas/tests/test_evaluator.py
+++ b/integrations/ragas/tests/test_evaluator.py
@@ -2,7 +2,7 @@
 
 import pytest
 from openai import AsyncOpenAI
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 from ragas.embeddings.base import embedding_factory
 from ragas.llms import llm_factory
 from ragas.metrics.base import SimpleBaseMetric
@@ -16,9 +16,10 @@
 from haystack.dataclasses import ChatMessage
 from haystack.document_stores.in_memory import InMemoryDocumentStore
 from haystack_integrations.components.evaluators.ragas import RagasEvaluator
+from tests.test_utils import ConcreteMetric, make_llm_mock
 
 
-def _make_metric(name: str, score: float = 0.8, reason: str = "test reason") -> MagicMock:
+def make_metric(name: str, score: float = 0.8, reason: str = "test reason") -> MagicMock:
     """Create a mock SimpleBaseMetric with a concrete ascore signature for inspect.signature."""
     metric = MagicMock(spec=SimpleBaseMetric)
     metric.name = name
@@ -33,12 +34,12 @@ async def ascore(user_input: str, response: str, retrieved_contexts: list) -> Me
 
 class TestInitialization:
     def test_successful_initialization(self):
-        metric = _make_metric("faithfulness")
+        metric = make_metric("faithfulness")
         evaluator = RagasEvaluator(ragas_metrics=[metric])
         assert evaluator.metrics == [metric]
 
     def test_initialization_with_multiple_metrics(self):
-        metrics = [_make_metric("faithfulness"), _make_metric("answer_relevancy")]
+        metrics = [make_metric("faithfulness"), make_metric("answer_relevancy")]
         evaluator = RagasEvaluator(ragas_metrics=metrics)
         assert len(evaluator.metrics) == 2
 
@@ -47,14 +48,14 @@ def test_invalid_metrics_raises_type_error(self):
             RagasEvaluator(ragas_metrics=["not_a_metric"])
 
     def test_invalid_metrics_mixed_raises_type_error(self):
-        valid = _make_metric("faithfulness")
+        valid = make_metric("faithfulness")
         with pytest.raises(TypeError):
             RagasEvaluator(ragas_metrics=[valid, "not_a_metric"])
 
 
 class TestRunResultStructure:
     def test_run_returns_metric_results_keyed_by_name(self):
-        metric = _make_metric("faithfulness", score=0.9)
+        metric = make_metric("faithfulness", score=0.9)
         evaluator = RagasEvaluator(ragas_metrics=[metric])
 
         output = evaluator.run(
@@ -70,7 +71,7 @@ def test_run_returns_metric_results_keyed_by_name(self):
         assert result.value == 0.9
 
     def test_run_scores_all_metrics(self):
-        metrics = [_make_metric("faithfulness", 0.9), _make_metric("answer_relevancy", 0.7)]
+        metrics = [make_metric("faithfulness", 0.9), make_metric("answer_relevancy", 0.7)]
         evaluator = RagasEvaluator(ragas_metrics=metrics)
 
         output = evaluator.run(query="test?", response="answer", documents=["doc"])
@@ -80,8 +81,8 @@ def test_run_scores_all_metrics(self):
         assert output["result"]["answer_relevancy"].value == 0.7
 
     def test_run_calls_score_on_each_metric(self):
-        metric_a = _make_metric("faithfulness")
-        metric_b = _make_metric("answer_relevancy")
+        metric_a = make_metric("faithfulness")
+        metric_b = make_metric("answer_relevancy")
         evaluator = RagasEvaluator(ragas_metrics=[metric_a, metric_b])
 
         evaluator.run(query="test?", response="answer", documents=["doc"])
@@ -108,7 +109,7 @@ async def ascore(user_input: str, response: str) -> MetricResult:
         metric.score.assert_called_once_with(user_input="test?", response="answer")
 
     def test_score_metric_omits_none_fields(self):
-        metric = _make_metric("faithfulness")
+        metric = make_metric("faithfulness")
         evaluator = RagasEvaluator(ragas_metrics=[metric])
 
         evaluator.run(query="test?", response="answer")  # no documents → retrieved_contexts=None
@@ -119,7 +120,7 @@ def test_score_metric_omits_none_fields(self):
 
 class TestRunInputProcessing:
     def test_run_accepts_document_objects(self):
-        metric = _make_metric("faithfulness")
+        metric = make_metric("faithfulness")
         evaluator = RagasEvaluator(ragas_metrics=[metric])
 
         evaluator.run(
@@ -132,7 +133,7 @@ def test_run_accepts_document_objects(self):
         assert kwargs["retrieved_contexts"] == ["some content", "more content"]
 
     def test_run_accepts_string_documents(self):
-        metric = _make_metric("faithfulness")
+        metric = make_metric("faithfulness")
         evaluator = RagasEvaluator(ragas_metrics=[metric])
 
         evaluator.run(query="test?", response="answer", documents=["doc one", "doc two"])
@@ -151,7 +152,7 @@ class TestRunInputValidation:
         ],
     )
     def test_run_raises_on_invalid_input_types(self, invalid_input, field_name, error_message):
-        evaluator = RagasEvaluator(ragas_metrics=[_make_metric("faithfulness")])
+        evaluator = RagasEvaluator(ragas_metrics=[make_metric("faithfulness")])
         query = "Which is the most popular global sport?"
         documents = ["Football is the most popular sport."]
         response = "Football is the most popular sport in the world"
@@ -167,17 +168,66 @@ def test_run_raises_on_invalid_input_types(self, invalid_input, field_name, erro
         assert error_message in str(exc_info.value)
 
 
+class TestSerialization:
+    def test_to_dict_type_field(self):
+        evaluator = RagasEvaluator(ragas_metrics=[ConcreteMetric()])
+        data = evaluator.to_dict()
+        assert data["type"] == "haystack_integrations.components.evaluators.ragas.evaluator.RagasEvaluator"
+
+    def test_to_dict_serializes_metric(self):
+        metric = ConcreteMetric(name="test_metric", llm=make_llm_mock())
+        data = RagasEvaluator(ragas_metrics=[metric]).to_dict()
+        serialized = data["init_parameters"]["ragas_metrics"][0]
+        assert serialized["name"] == "test_metric"
+        assert serialized["llm"] == {"model": "gpt-4o-mini", "provider": "openai"}
+
+    def test_to_dict_serializes_multiple_metrics(self):
+        metrics = [ConcreteMetric(name="m1"), ConcreteMetric(name="m2")]
+        data = RagasEvaluator(ragas_metrics=metrics).to_dict()
+        names = [m["name"] for m in data["init_parameters"]["ragas_metrics"]]
+        assert names == ["m1", "m2"]
+
+    def test_from_dict_reconstructs_evaluator(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test")
+        fake_llm = make_llm_mock()
+        evaluator = RagasEvaluator(ragas_metrics=[ConcreteMetric(name="concrete_metric", llm=fake_llm)])
+        data = evaluator.to_dict()
+
+        with patch("haystack_integrations.components.evaluators.ragas.utils.llm_factory", return_value=fake_llm):
+            reconstructed = RagasEvaluator.from_dict(data)
+
+        assert len(reconstructed.metrics) == 1
+        assert reconstructed.metrics[0].name == "concrete_metric"
+
+    def test_from_dict_raises_for_unsupported_provider(self):
+        data = {
+            "type": "haystack_integrations.components.evaluators.ragas.evaluator.RagasEvaluator",
+            "init_parameters": {
+                "ragas_metrics": [
+                    {
+                        "type": "tests.test_utils.ConcreteMetric",
+                        "name": "some_metric",
+                        "llm": {"model": "gemini-pro", "provider": "google"},
+                    }
+                ]
+            },
+        }
+
+        with pytest.raises(ValueError, match="only supports the 'openai' provider"):
+            RagasEvaluator.from_dict(data)
+
+
 @pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="Set OPENAI_API_KEY to run integration tests.")
 @pytest.mark.integration
 class TestStandaloneEvaluationIntegration:
-    def _make_llm(self):
+    def make_llm(self):
         return llm_factory("gpt-4o-mini", client=AsyncOpenAI())
 
-    def _make_embeddings(self):
+    def make_embeddings(self):
         return embedding_factory("openai", model="text-embedding-3-small", client=AsyncOpenAI())
 
     def test_faithfulness_returns_valid_score(self):
-        evaluator = RagasEvaluator(ragas_metrics=[Faithfulness(llm=self._make_llm())])
+        evaluator = RagasEvaluator(ragas_metrics=[Faithfulness(llm=self.make_llm())])
 
         output = evaluator.run(
             query="What makes Meta AI's LLaMA models stand out?",
@@ -196,7 +246,7 @@ def test_faithfulness_returns_valid_score(self):
     def test_answer_relevancy_uses_only_query_and_response(self):
         """AnswerRelevancy only declares user_input + response in ascore — documents should not be forwarded."""
         evaluator = RagasEvaluator(
-            ragas_metrics=[AnswerRelevancy(llm=self._make_llm(), embeddings=self._make_embeddings())]
+            ragas_metrics=[AnswerRelevancy(llm=self.make_llm(), embeddings=self.make_embeddings())]
         )
 
         output = evaluator.run(
@@ -210,8 +260,8 @@ def test_answer_relevancy_uses_only_query_and_response(self):
         assert 0.0 <= result.value <= 1.0
 
     def test_multiple_metrics_all_return_results(self):
-        llm = self._make_llm()
-        embeddings = self._make_embeddings()
+        llm = self.make_llm()
+        embeddings = self.make_embeddings()
         evaluator = RagasEvaluator(
             ragas_metrics=[
                 Faithfulness(llm=llm),
diff --git a/integrations/ragas/tests/test_utils.py b/integrations/ragas/tests/test_utils.py
new file mode 100644
index 0000000000..476d0b7431
--- /dev/null
+++ b/integrations/ragas/tests/test_utils.py
@@ -0,0 +1,112 @@
+import pytest
+from unittest.mock import MagicMock, patch
+from ragas.metrics.base import SimpleBaseMetric
+from ragas.metrics.result import MetricResult
+from haystack_integrations.components.evaluators.ragas.utils import _deserialize_metric, _serialize_metric
+
+
+class ConcreteMetric(SimpleBaseMetric):
+    """Minimal concrete SimpleBaseMetric for serialization tests."""
+
+    def __init__(self, name: str = "concrete_metric", llm=None, embeddings=None):
+        self.name = name
+        self.llm = llm
+        self.embeddings = embeddings
+
+    async def ascore(self, user_input: str, response: str) -> MetricResult:
+        return MetricResult(value=1.0, reason="test")
+
+    def score(self, **kwargs) -> MetricResult:
+        return MetricResult(value=1.0, reason="test")
+
+
+def make_llm_mock(model: str = "gpt-4o-mini", provider: str = "openai") -> MagicMock:
+    llm = MagicMock()
+    llm.model = model
+    llm.provider = provider
+    return llm
+
+
+def make_emb_mock(model: str = "text-embedding-3-small", provider: str = "openai") -> MagicMock:
+    emb = MagicMock()
+    emb.model = model
+    emb.PROVIDER_NAME = provider
+    return emb
+
+
+class TestSerializeMetric:
+    def test_stores_type_path(self):
+        result = _serialize_metric(ConcreteMetric())
+        assert "type" in result
+        assert result["type"].endswith(".ConcreteMetric")
+
+    def test_stores_name(self):
+        result = _serialize_metric(ConcreteMetric(name="my_metric"))
+        assert result["name"] == "my_metric"
+
+    def test_stores_llm(self):
+        metric = 
ConcreteMetric(llm=make_llm_mock("gpt-4o-mini", "openai")) + result = _serialize_metric(metric) + assert result["llm"] == {"model": "gpt-4o-mini", "provider": "openai"} + + def test_stores_embeddings(self): + metric = ConcreteMetric(embeddings=make_emb_mock("text-embedding-3-small", "openai")) + result = _serialize_metric(metric) + assert result["embeddings"] == {"model": "text-embedding-3-small", "provider": "openai"} + + def test_omits_llm_when_none(self): + assert "llm" not in _serialize_metric(ConcreteMetric()) + + def test_omits_embeddings_when_none(self): + assert "embeddings" not in _serialize_metric(ConcreteMetric()) + + +class TestDeserializeMetric: + def test_reconstructs_instance(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + fake_llm = make_llm_mock() + data = _serialize_metric(ConcreteMetric(name="concrete_metric", llm=fake_llm)) + + with patch("haystack_integrations.components.evaluators.ragas.utils.llm_factory", return_value=fake_llm): + result = _deserialize_metric(data) + + assert isinstance(result, ConcreteMetric) + assert result.name == "concrete_metric" + assert result.llm is fake_llm + + def test_reconstructs_embeddings(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + fake_emb = make_emb_mock() + data = _serialize_metric(ConcreteMetric(name="concrete_metric", embeddings=fake_emb)) + + with patch("haystack_integrations.components.evaluators.ragas.utils.embedding_factory", return_value=fake_emb): + result = _deserialize_metric(data) + + assert result.embeddings is fake_emb + + def test_raises_for_unsupported_llm_provider(self): + data = { + "type": "tests.test_utils.ConcreteMetric", + "name": "concrete_metric", + "llm": {"model": "gemini-pro", "provider": "google"}, + } + + with pytest.raises(ValueError, match="only supports the 'openai' provider"): + _deserialize_metric(data) + + def test_raises_for_unsupported_embeddings_provider(self): + data = { + "type": "tests.test_utils.ConcreteMetric", + "name": "concrete_metric", + "embeddings": {"model": "embedding-001", "provider": "google"}, + } + + with pytest.raises(ValueError, match="only supports the 'openai' provider"): + _deserialize_metric(data) + + def test_round_trip(self): + metric = ConcreteMetric(name="round_trip") + result = _deserialize_metric(_serialize_metric(metric)) + + assert isinstance(result, ConcreteMetric) + assert result.name == "round_trip" From 25834dcec77566f5cdcc40b244a335efc88ae607 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 22 Apr 2026 14:02:27 +0200 Subject: [PATCH 09/20] Update test --- integrations/ragas/tests/test_evaluator.py | 24 ++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py index 2568639643..5c2f68d6ec 100644 --- a/integrations/ragas/tests/test_evaluator.py +++ b/integrations/ragas/tests/test_evaluator.py @@ -16,9 +16,29 @@ from haystack.dataclasses import ChatMessage from haystack.document_stores.in_memory import InMemoryDocumentStore from haystack_integrations.components.evaluators.ragas import RagasEvaluator -from tests.test_utils import ConcreteMetric, make_llm_mock +class ConcreteMetric(SimpleBaseMetric): + """Minimal concrete SimpleBaseMetric for serialization tests.""" + + def __init__(self, name: str = "concrete_metric", llm=None, embeddings=None): + self.name = name + self.llm = llm + self.embeddings = embeddings + + async def ascore(self, user_input: str, response: str) -> MetricResult: + return 
MetricResult(value=1.0, reason="test") + + def score(self, **kwargs) -> MetricResult: + return MetricResult(value=1.0, reason="test") + + +def make_llm_mock(model: str = "gpt-4o-mini", provider: str = "openai") -> MagicMock: + llm = MagicMock() + llm.model = model + llm.provider = provider + return llm + def make_metric(name: str, score: float = 0.8, reason: str = "test reason") -> MagicMock: """Create a mock SimpleBaseMetric with a concrete ascore signature for inspect.signature.""" metric = MagicMock(spec=SimpleBaseMetric) @@ -205,7 +225,7 @@ def test_from_dict_raises_for_unsupported_provider(self): "init_parameters": { "ragas_metrics": [ { - "type": "tests.test_utils.ConcreteMetric", + "type": "tests.test_evaluator.ConcreteMetric", "name": "some_metric", "llm": {"model": "gemini-pro", "provider": "google"}, } From 5598b58418812a5379d8e7d3f803eb0c36c1659c Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 22 Apr 2026 14:07:59 +0200 Subject: [PATCH 10/20] Refactoring --- integrations/ragas/tests/test_evaluator.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py index 5c2f68d6ec..070e339dac 100644 --- a/integrations/ragas/tests/test_evaluator.py +++ b/integrations/ragas/tests/test_evaluator.py @@ -73,17 +73,15 @@ def test_invalid_metrics_mixed_raises_type_error(self): RagasEvaluator(ragas_metrics=[valid, "not_a_metric"]) -class TestRunResultStructure: +class TestRun: def test_run_returns_metric_results_keyed_by_name(self): metric = make_metric("faithfulness", score=0.9) evaluator = RagasEvaluator(ragas_metrics=[metric]) - output = evaluator.run( query="Which is the most popular global sport?", response="Football is the most popular sport.", documents=["Football is undoubtedly the world's most popular sport."], ) - assert "result" in output assert "faithfulness" in output["result"] result = output["result"]["faithfulness"] @@ -93,9 +91,7 @@ def test_run_returns_metric_results_keyed_by_name(self): def test_run_scores_all_metrics(self): metrics = [make_metric("faithfulness", 0.9), make_metric("answer_relevancy", 0.7)] evaluator = RagasEvaluator(ragas_metrics=metrics) - output = evaluator.run(query="test?", response="answer", documents=["doc"]) - assert set(output["result"].keys()) == {"faithfulness", "answer_relevancy"} assert output["result"]["faithfulness"].value == 0.9 assert output["result"]["answer_relevancy"].value == 0.7 @@ -104,14 +100,10 @@ def test_run_calls_score_on_each_metric(self): metric_a = make_metric("faithfulness") metric_b = make_metric("answer_relevancy") evaluator = RagasEvaluator(ragas_metrics=[metric_a, metric_b]) - evaluator.run(query="test?", response="answer", documents=["doc"]) - metric_a.score.assert_called_once() metric_b.score.assert_called_once() - -class TestRunParameterFiltering: def test_score_metric_passes_only_matching_params(self): """Metric that only needs user_input + response should not receive retrieved_contexts.""" metric = MagicMock(spec=SimpleBaseMetric) @@ -125,44 +117,33 @@ async def ascore(user_input: str, response: str) -> MetricResult: evaluator = RagasEvaluator(ragas_metrics=[metric]) evaluator.run(query="test?", response="answer", documents=["doc"], reference="ref") - metric.score.assert_called_once_with(user_input="test?", response="answer") def test_score_metric_omits_none_fields(self): metric = make_metric("faithfulness") evaluator = RagasEvaluator(ragas_metrics=[metric]) - evaluator.run(query="test?", 
response="answer") # no documents → retrieved_contexts=None - _, kwargs = metric.score.call_args assert "retrieved_contexts" not in kwargs - -class TestRunInputProcessing: def test_run_accepts_document_objects(self): metric = make_metric("faithfulness") evaluator = RagasEvaluator(ragas_metrics=[metric]) - evaluator.run( query="test?", response="answer", documents=[Document(content="some content"), Document(content="more content")], ) - _, kwargs = metric.score.call_args assert kwargs["retrieved_contexts"] == ["some content", "more content"] def test_run_accepts_string_documents(self): metric = make_metric("faithfulness") evaluator = RagasEvaluator(ragas_metrics=[metric]) - evaluator.run(query="test?", response="answer", documents=["doc one", "doc two"]) - _, kwargs = metric.score.call_args assert kwargs["retrieved_contexts"] == ["doc one", "doc two"] - -class TestRunInputValidation: @pytest.mark.parametrize( "invalid_input,field_name,error_message", [ From a0dd1a322185d6ed1479228dcc49e6a5971c97d1 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 22 Apr 2026 14:11:57 +0200 Subject: [PATCH 11/20] consolidate tests --- integrations/ragas/tests/test_evaluator.py | 73 ++++++++++++---------- integrations/ragas/tests/test_utils.py | 4 +- 2 files changed, 44 insertions(+), 33 deletions(-) diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py index 070e339dac..ceea4895a9 100644 --- a/integrations/ragas/tests/test_evaluator.py +++ b/integrations/ragas/tests/test_evaluator.py @@ -1,13 +1,7 @@ import os +from unittest.mock import MagicMock, patch import pytest -from openai import AsyncOpenAI -from unittest.mock import MagicMock, patch -from ragas.embeddings.base import embedding_factory -from ragas.llms import llm_factory -from ragas.metrics.base import SimpleBaseMetric -from ragas.metrics.collections import AnswerRelevancy, ContextPrecision, Faithfulness -from ragas.metrics.result import MetricResult from haystack import Document, Pipeline from haystack.components.builders import AnswerBuilder, ChatPromptBuilder from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder @@ -15,6 +9,13 @@ from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever from haystack.dataclasses import ChatMessage from haystack.document_stores.in_memory import InMemoryDocumentStore +from openai import AsyncOpenAI +from ragas.embeddings.base import embedding_factory +from ragas.llms import llm_factory +from ragas.metrics.base import SimpleBaseMetric +from ragas.metrics.collections import AnswerRelevancy, ContextPrecision, Faithfulness +from ragas.metrics.result import MetricResult + from haystack_integrations.components.evaluators.ragas import RagasEvaluator @@ -39,6 +40,7 @@ def make_llm_mock(model: str = "gpt-4o-mini", provider: str = "openai") -> Magic llm.provider = provider return llm + def make_metric(name: str, score: float = 0.8, reason: str = "test reason") -> MagicMock: """Create a mock SimpleBaseMetric with a concrete ascore signature for inspect.signature.""" metric = MagicMock(spec=SimpleBaseMetric) @@ -170,33 +172,40 @@ def test_run_raises_on_invalid_input_types(self, invalid_input, field_name, erro class TestSerialization: - def test_to_dict_type_field(self): - evaluator = RagasEvaluator(ragas_metrics=[ConcreteMetric()]) - data = evaluator.to_dict() - assert data["type"] == "haystack_integrations.components.evaluators.ragas.evaluator.RagasEvaluator" - - def test_to_dict_serializes_metric(self): - metric = 
ConcreteMetric(name="test_metric", llm=make_llm_mock()) - data = RagasEvaluator(ragas_metrics=[metric]).to_dict() - serialized = data["init_parameters"]["ragas_metrics"][0] - assert serialized["name"] == "test_metric" - assert serialized["llm"] == {"model": "gpt-4o-mini", "provider": "openai"} - - def test_to_dict_serializes_multiple_metrics(self): - metrics = [ConcreteMetric(name="m1"), ConcreteMetric(name="m2")] - data = RagasEvaluator(ragas_metrics=metrics).to_dict() - names = [m["name"] for m in data["init_parameters"]["ragas_metrics"]] - assert names == ["m1", "m2"] - - def test_from_dict_reconstructs_evaluator(self, monkeypatch): - monkeypatch.setenv("OPENAI_API_KEY", "test") - fake_llm = make_llm_mock() - evaluator = RagasEvaluator(ragas_metrics=[ConcreteMetric(name="concrete_metric", llm=fake_llm)]) + def test_to_dict(self): + evaluator = RagasEvaluator( + ragas_metrics=[ConcreteMetric(llm=make_llm_mock()), ConcreteMetric(name="another_metric")] + ) data = evaluator.to_dict() + assert data == { + "type": "haystack_integrations.components.evaluators.ragas.evaluator.RagasEvaluator", + "init_parameters": { + "ragas_metrics": [ + { + "type": "tests.test_evaluator.ConcreteMetric", + "name": "concrete_metric", + "llm": {"model": "gpt-4o-mini", "provider": "openai"}, + }, + {"type": "tests.test_evaluator.ConcreteMetric", "name": "another_metric"}, + ] + }, + } - with patch("haystack_integrations.components.evaluators.ragas.utils.llm_factory", return_value=fake_llm): - reconstructed = RagasEvaluator.from_dict(data) - + def test_from_dict(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + data = { + "type": "haystack_integrations.components.evaluators.ragas.evaluator.RagasEvaluator", + "init_parameters": { + "ragas_metrics": [ + { + "type": "tests.test_evaluator.ConcreteMetric", + "name": "concrete_metric", + "llm": {"model": "gpt-4o-mini", "provider": "openai"}, + }, + ], + }, + } + reconstructed = RagasEvaluator.from_dict(data) assert len(reconstructed.metrics) == 1 assert reconstructed.metrics[0].name == "concrete_metric" diff --git a/integrations/ragas/tests/test_utils.py b/integrations/ragas/tests/test_utils.py index 476d0b7431..f19349607b 100644 --- a/integrations/ragas/tests/test_utils.py +++ b/integrations/ragas/tests/test_utils.py @@ -1,7 +1,9 @@ -import pytest from unittest.mock import MagicMock, patch + +import pytest from ragas.metrics.base import SimpleBaseMetric from ragas.metrics.result import MetricResult + from haystack_integrations.components.evaluators.ragas.utils import _deserialize_metric, _serialize_metric From f1fd4ecd09a8f2ce55811a40f92eead69c7f0138 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 22 Apr 2026 15:11:51 +0200 Subject: [PATCH 12/20] refactoring --- integrations/ragas/tests/test_evaluator.py | 43 ++++++++++------------ 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py index ceea4895a9..7f2c568143 100644 --- a/integrations/ragas/tests/test_evaluator.py +++ b/integrations/ragas/tests/test_evaluator.py @@ -1,5 +1,5 @@ import os -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest from haystack import Document, Pipeline @@ -34,13 +34,6 @@ def score(self, **kwargs) -> MetricResult: return MetricResult(value=1.0, reason="test") -def make_llm_mock(model: str = "gpt-4o-mini", provider: str = "openai") -> MagicMock: - llm = MagicMock() - llm.model = model - llm.provider = provider - return 
llm - - def make_metric(name: str, score: float = 0.8, reason: str = "test reason") -> MagicMock: """Create a mock SimpleBaseMetric with a concrete ascore signature for inspect.signature.""" metric = MagicMock(spec=SimpleBaseMetric) @@ -54,14 +47,23 @@ async def ascore(user_input: str, response: str, retrieved_contexts: list) -> Me return metric -class TestInitialization: - def test_successful_initialization(self): - metric = make_metric("faithfulness") +class TestInit: + def test_init(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + metric = Faithfulness(llm=llm_factory("gpt-4o-mini", client=AsyncOpenAI())) evaluator = RagasEvaluator(ragas_metrics=[metric]) assert evaluator.metrics == [metric] - def test_initialization_with_multiple_metrics(self): - metrics = [make_metric("faithfulness"), make_metric("answer_relevancy")] + def test_init_with_multiple_metrics(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + llm = llm_factory("gpt-4o-mini", client=AsyncOpenAI()) + metrics = [ + Faithfulness(llm=llm), + AnswerRelevancy( + llm=llm, + embeddings=embedding_factory("openai", model="text-embedding-3-small", client=AsyncOpenAI()) + ), + ] evaluator = RagasEvaluator(ragas_metrics=metrics) assert len(evaluator.metrics) == 2 @@ -69,14 +71,9 @@ def test_invalid_metrics_raises_type_error(self): with pytest.raises(TypeError, match="All items in ragas_metrics must be instances of SimpleBaseMetric."): RagasEvaluator(ragas_metrics=["not_a_metric"]) - def test_invalid_metrics_mixed_raises_type_error(self): - valid = make_metric("faithfulness") - with pytest.raises(TypeError): - RagasEvaluator(ragas_metrics=[valid, "not_a_metric"]) - class TestRun: - def test_run_returns_metric_results_keyed_by_name(self): + def test_run_returns_result_by_metric_name(self, monkeypatch): metric = make_metric("faithfulness", score=0.9) evaluator = RagasEvaluator(ragas_metrics=[metric]) output = evaluator.run( @@ -172,10 +169,10 @@ def test_run_raises_on_invalid_input_types(self, invalid_input, field_name, erro class TestSerialization: - def test_to_dict(self): - evaluator = RagasEvaluator( - ragas_metrics=[ConcreteMetric(llm=make_llm_mock()), ConcreteMetric(name="another_metric")] - ) + def test_to_dict(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + llm = llm_factory("gpt-4o-mini", client=AsyncOpenAI()) + evaluator = RagasEvaluator(ragas_metrics=[ConcreteMetric(llm=llm), ConcreteMetric(name="another_metric")]) data = evaluator.to_dict() assert data == { "type": "haystack_integrations.components.evaluators.ragas.evaluator.RagasEvaluator", From c721058b7686570aeda96752bda6e5f21f539c84 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 22 Apr 2026 15:22:48 +0200 Subject: [PATCH 13/20] define run method signature better --- .../components/evaluators/ragas/evaluator.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py index bb3ea29c23..f2d959ff2b 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py @@ -99,7 +99,7 @@ def from_dict(cls, data: dict[str, Any]) -> "RagasEvaluator": data["init_parameters"]["ragas_metrics"] = [_deserialize_metric(m) for m in metrics_data] return default_from_dict(cls, data) - 
@component.output_types(result=dict) + @component.output_types(result=dict[str, MetricResult]) def run( self, query: str | None = None, @@ -109,7 +109,7 @@ def run( multi_responses: list[str] | None = None, reference: str | None = None, rubrics: dict[str, str] | None = None, - ) -> dict[str, Any]: + ) -> dict[str, MetricResult]: """ Evaluates the provided inputs against each metric and returns the results. @@ -136,7 +136,7 @@ def run( reference=reference, rubrics=rubrics, ) - except (ValueError, ValidationError) as e: + except ValidationError as e: self._handle_conversion_error(e) results: dict[str, MetricResult] = {} @@ -208,10 +208,7 @@ def _handle_conversion_error(self, error: Exception) -> None: :params error: Original error. """ if isinstance(error, ValidationError): - field_mapping = { - "user_input": "query", - "retrieved_contexts": "documents", - } + field_mapping = {"user_input": "query", "retrieved_contexts": "documents"} for err in error.errors(): # loc is a tuple of strings and ints but according to pydantic docs, the first element is a string # https://docs.pydantic.dev/latest/errors/errors/ @@ -257,7 +254,7 @@ def _get_example_input(self, field: str) -> str: "query": "A string query like 'Question?'", "documents": "[Document(content='Example content')]", "reference_contexts": "['Example string 1', 'Example string 2']", - "response": "ChatMessage(_content='Hi', _role='assistant')", + "response": "ChatMessage.from_assistant('Hi')", "multi_responses": "['Response 1', 'Response 2']", "reference": "'A reference string'", "rubrics": "{'score1': 'high_similarity'}", From 7c843b87f58d3833739445253286b2a845523a22 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 22 Apr 2026 15:27:05 +0200 Subject: [PATCH 14/20] Fix types --- .../components/evaluators/ragas/evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py index f2d959ff2b..1fdb676a4e 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py @@ -99,7 +99,7 @@ def from_dict(cls, data: dict[str, Any]) -> "RagasEvaluator": data["init_parameters"]["ragas_metrics"] = [_deserialize_metric(m) for m in metrics_data] return default_from_dict(cls, data) - @component.output_types(result=dict[str, MetricResult]) + @component.output_types(result=dict[str, dict[str, MetricResult]]) def run( self, query: str | None = None, @@ -109,7 +109,7 @@ def run( multi_responses: list[str] | None = None, reference: str | None = None, rubrics: dict[str, str] | None = None, - ) -> dict[str, MetricResult]: + ) -> dict[str, dict[str, MetricResult]]: """ Evaluates the provided inputs against each metric and returns the results. 
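
With PATCH 13 and 14 applied, `run()` exposes a single `result` output whose
value maps each metric's name to its ragas `MetricResult`. The sketch below
shows how a caller reads that output; it is a minimal illustration assuming
`ragas>=0.4.0`, an `OPENAI_API_KEY` in the environment, and placeholder query
and model names, not a verbatim excerpt from the series.

```python
from openai import AsyncOpenAI
from ragas.llms import llm_factory
from ragas.metrics.collections import Faithfulness

from haystack_integrations.components.evaluators.ragas import RagasEvaluator

# Build the evaluator as in the integration tests; the model name is a
# placeholder, any OpenAI chat model accepted by llm_factory works here.
evaluator = RagasEvaluator(
    ragas_metrics=[Faithfulness(llm=llm_factory("gpt-4o-mini", client=AsyncOpenAI()))]
)

output = evaluator.run(
    query="Which planet is known as the Red Planet?",
    documents=["Mars is often called the Red Planet."],
    response="Mars is known as the Red Planet.",
)

# output["result"] maps metric names to MetricResult objects, each carrying
# a numeric `value` and a free-text `reason`.
for metric_name, metric_result in output["result"].items():
    print(metric_name, metric_result.value, metric_result.reason)
```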
From ba638384447ad1b4d675ce0ca564def04003f0cd Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Thu, 23 Apr 2026 10:00:51 +0200 Subject: [PATCH 15/20] update pin --- integrations/ragas/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/ragas/pyproject.toml b/integrations/ragas/pyproject.toml index 0d81a347f4..e5349de1b2 100644 --- a/integrations/ragas/pyproject.toml +++ b/integrations/ragas/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.22.0", "ragas>=0.4.0"] +dependencies = ["haystack-ai>=2.22.0", "ragas>=0.4.3"] [project.urls] Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/ragas" From 55bbdd555fb07ea1d7f2185af8bca9c67c366f1a Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Thu, 23 Apr 2026 10:37:42 +0200 Subject: [PATCH 16/20] fix docstring --- .../components/evaluators/ragas/evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py index 1fdb676a4e..33802b1a13 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py @@ -121,7 +121,7 @@ def run( :param reference: A string reference answer for the query. :param rubrics: A dictionary of evaluation rubric, where keys represent the score and the values represent the corresponding evaluation criteria. - :return: A dictionary with key ``result`` mapping metric names to their `MetricResult`. + :return: A dictionary with key `result` mapping metric names to their `MetricResult`. 
""" processed_docs = self._process_documents(documents) processed_response = self._process_response(response) From 6753caa497408adf8d488fd4dfa74d68aaf581ea Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Thu, 23 Apr 2026 10:41:12 +0200 Subject: [PATCH 17/20] add integration tests to workflow --- .github/workflows/ragas.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ragas.yml b/.github/workflows/ragas.yml index 4446f4964c..f7cd8da01f 100644 --- a/.github/workflows/ragas.yml +++ b/.github/workflows/ragas.yml @@ -103,7 +103,8 @@ jobs: name: coverage-comment-ragas path: python-coverage-comment-action-ragas.txt - # No integration tests yet — add integration-cov-append-retry + combined coverage step when needed + - name: Run integration tests + run: hatch run test:integration-cov-append-retry - name: Run unit tests with lowest direct dependencies if: github.event_name != 'push' From 5c1858a13a2c71907a14caeb17ab0656f562dd20 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Thu, 23 Apr 2026 10:49:59 +0200 Subject: [PATCH 18/20] simplify tests --- integrations/ragas/tests/test_evaluator.py | 3 +- integrations/ragas/tests/test_utils.py | 84 +++++++--------------- 2 files changed, 28 insertions(+), 59 deletions(-) diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py index 7f2c568143..fd6197944b 100644 --- a/integrations/ragas/tests/test_evaluator.py +++ b/integrations/ragas/tests/test_evaluator.py @@ -60,8 +60,7 @@ def test_init_with_multiple_metrics(self, monkeypatch): metrics = [ Faithfulness(llm=llm), AnswerRelevancy( - llm=llm, - embeddings=embedding_factory("openai", model="text-embedding-3-small", client=AsyncOpenAI()) + llm=llm, embeddings=embedding_factory("openai", model="text-embedding-3-small", client=AsyncOpenAI()) ), ] evaluator = RagasEvaluator(ragas_metrics=metrics) diff --git a/integrations/ragas/tests/test_utils.py b/integrations/ragas/tests/test_utils.py index f19349607b..aeaddf73fe 100644 --- a/integrations/ragas/tests/test_utils.py +++ b/integrations/ragas/tests/test_utils.py @@ -1,6 +1,9 @@ from unittest.mock import MagicMock, patch import pytest +from openai import AsyncOpenAI +from ragas.embeddings.base import embedding_factory +from ragas.llms import llm_factory from ragas.metrics.base import SimpleBaseMetric from ragas.metrics.result import MetricResult @@ -22,69 +25,36 @@ def score(self, **kwargs) -> MetricResult: return MetricResult(value=1.0, reason="test") -def make_llm_mock(model: str = "gpt-4o-mini", provider: str = "openai") -> MagicMock: - llm = MagicMock() - llm.model = model - llm.provider = provider - return llm - - -def make_emb_mock(model: str = "text-embedding-3-small", provider: str = "openai") -> MagicMock: - emb = MagicMock() - emb.model = model - emb.PROVIDER_NAME = provider - return emb - - -class TestSerializeMetric: - def test_stores_type_path(self): - result = _serialize_metric(ConcreteMetric()) - assert "type" in result - assert result["type"].endswith(".ConcreteMetric") - - def test_stores_name(self): - result = _serialize_metric(ConcreteMetric(name="my_metric")) - assert result["name"] == "my_metric" - - def test_stores_llm(self): - metric = ConcreteMetric(llm=make_llm_mock("gpt-4o-mini", "openai")) - result = _serialize_metric(metric) - assert result["llm"] == {"model": "gpt-4o-mini", "provider": "openai"} - - def test_stores_embeddings(self): - metric = ConcreteMetric(embeddings=make_emb_mock("text-embedding-3-small", "openai")) - result = 
_serialize_metric(metric) - assert result["embeddings"] == {"model": "text-embedding-3-small", "provider": "openai"} - - def test_omits_llm_when_none(self): - assert "llm" not in _serialize_metric(ConcreteMetric()) - - def test_omits_embeddings_when_none(self): - assert "embeddings" not in _serialize_metric(ConcreteMetric()) +def test_serialization(monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + result = _serialize_metric( + ConcreteMetric( + llm=llm_factory("gpt-4o-mini", client=AsyncOpenAI()), + embeddings=embedding_factory("openai", model="text-embedding-3-small", client=AsyncOpenAI()), + ) + ) + assert result == { + "type": "tests.test_utils.ConcreteMetric", + "name": "concrete_metric", + "llm": {"model": "gpt-4o-mini", "provider": "openai"}, + "embeddings": {"model": "text-embedding-3-small", "provider": "openai"}, + } class TestDeserializeMetric: - def test_reconstructs_instance(self, monkeypatch): + def test_deserialization(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test") - fake_llm = make_llm_mock() - data = _serialize_metric(ConcreteMetric(name="concrete_metric", llm=fake_llm)) - - with patch("haystack_integrations.components.evaluators.ragas.utils.llm_factory", return_value=fake_llm): - result = _deserialize_metric(data) - + data = { + "type": "tests.test_utils.ConcreteMetric", + "name": "concrete_metric", + "llm": {"model": "gpt-4o-mini", "provider": "openai"}, + "embeddings": {"model": "text-embedding-3-small", "provider": "openai"}, + } + result = _deserialize_metric(data) assert isinstance(result, ConcreteMetric) assert result.name == "concrete_metric" - assert result.llm is fake_llm - - def test_reconstructs_embeddings(self, monkeypatch): - monkeypatch.setenv("OPENAI_API_KEY", "test") - fake_emb = make_emb_mock() - data = _serialize_metric(ConcreteMetric(name="concrete_metric", embeddings=fake_emb)) - - with patch("haystack_integrations.components.evaluators.ragas.utils.embedding_factory", return_value=fake_emb): - result = _deserialize_metric(data) - - assert result.embeddings is fake_emb + assert result.llm.model == "gpt-4o-mini" + assert result.embeddings.model == "text-embedding-3-small" def test_raises_for_unsupported_llm_provider(self): data = { From 29c1d20161282ee9270af046fa9bd29ee912fe2c Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Thu, 23 Apr 2026 10:50:16 +0200 Subject: [PATCH 19/20] remove unused import --- integrations/ragas/tests/test_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/integrations/ragas/tests/test_utils.py b/integrations/ragas/tests/test_utils.py index aeaddf73fe..d45b7999b1 100644 --- a/integrations/ragas/tests/test_utils.py +++ b/integrations/ragas/tests/test_utils.py @@ -1,5 +1,3 @@ -from unittest.mock import MagicMock, patch - import pytest from openai import AsyncOpenAI from ragas.embeddings.base import embedding_factory From 5ffa9e03fb9b65c597957df1cc5aff2aebd79431 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Thu, 23 Apr 2026 13:29:39 +0200 Subject: [PATCH 20/20] PR comments --- .../components/evaluators/ragas/evaluator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py index 33802b1a13..7672a03b20 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py +++ 
b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py @@ -56,11 +56,14 @@ def __init__(self, ragas_metrics: list[SimpleBaseMetric]) -> None: :param ragas_metrics: A list of modern Ragas metrics from `ragas.metrics.collections`. Each metric must be fully configured (including its LLM) at construction time. + Available metrics can be found in the + [Ragas documentation](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/). """ self._validate_inputs(ragas_metrics) self.metrics = ragas_metrics - def _validate_inputs(self, metrics: list[SimpleBaseMetric]) -> None: + @staticmethod + def _validate_inputs(metrics: list[SimpleBaseMetric]) -> None: """ Validate input parameters.
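
Taken together, the series leaves `RagasEvaluator` with collections-style
metrics, signature-based parameter filtering, and metric-level
(de)serialization. The round-trip below is a minimal sketch of that end
state, assuming `ragas>=0.4.3`, an `OPENAI_API_KEY` in the environment, and
that collections metrics such as `Faithfulness` accept the serialized `name`
and `llm` fields as constructor kwargs (as `_deserialize_metric` expects);
the model name is illustrative.

```python
from openai import AsyncOpenAI
from ragas.llms import llm_factory
from ragas.metrics.collections import Faithfulness

from haystack_integrations.components.evaluators.ragas import RagasEvaluator

evaluator = RagasEvaluator(
    ragas_metrics=[Faithfulness(llm=llm_factory("gpt-4o-mini", client=AsyncOpenAI()))]
)

# to_dict() records each metric as
#   {"type": <import path>, "name": <metric name>,
#    "llm": {"model": "gpt-4o-mini", "provider": "openai"}}
# and from_dict() rebuilds the LLM via llm_factory; any provider other
# than "openai" raises ValueError during deserialization.
data = evaluator.to_dict()
restored = RagasEvaluator.from_dict(data)
assert restored.metrics[0].name == evaluator.metrics[0].name
```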