diff --git a/.github/workflows/ragas.yml b/.github/workflows/ragas.yml index 4446f4964c..f7cd8da01f 100644 --- a/.github/workflows/ragas.yml +++ b/.github/workflows/ragas.yml @@ -103,7 +103,8 @@ jobs: name: coverage-comment-ragas path: python-coverage-comment-action-ragas.txt - # No integration tests yet — add integration-cov-append-retry + combined coverage step when needed + - name: Run integration tests + run: hatch run test:integration-cov-append-retry - name: Run unit tests with lowest direct dependencies if: github.event_name != 'push' diff --git a/integrations/ragas/pyproject.toml b/integrations/ragas/pyproject.toml index cfe2c9c0e7..e5349de1b2 100644 --- a/integrations/ragas/pyproject.toml +++ b/integrations/ragas/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.22.0", "ragas>=0.2.6,<0.3.0"] +dependencies = ["haystack-ai>=2.22.0", "ragas>=0.4.3"] [project.urls] Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/ragas" @@ -164,3 +164,10 @@ parallel = false omit = ["*/tests/*", "*/__init__.py"] show_missing = true exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] +log_cli = true diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py index addb042807..7672a03b20 100644 --- a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/evaluator.py @@ -1,19 +1,14 @@ -import re +import inspect from typing import Any, Union, cast, get_args, get_origin -from haystack import Document, component +from haystack import Document, component, default_from_dict, default_to_dict from haystack.dataclasses import ChatMessage from pydantic import ValidationError -from ragas import evaluate -from ragas.dataset_schema import ( - EvaluationDataset, - EvaluationResult, - SingleTurnSample, -) -from ragas.embeddings import BaseRagasEmbeddings -from ragas.llms import BaseRagasLLM -from ragas.metrics import Metric +from haystack_integrations.components.evaluators.ragas.utils import _deserialize_metric, _serialize_metric +from ragas.dataset_schema import SingleTurnSample +from ragas.metrics.base import SimpleBaseMetric +from ragas.metrics.result import MetricResult @component @@ -23,19 +18,21 @@ class RagasEvaluator: See the [Ragas framework](https://docs.ragas.io/) for more details. + This component supports the modern Ragas metrics API (`ragas.metrics.collections`). + Each metric must be a `SimpleBaseMetric` instance with its LLM configured at construction time. 
+ Usage example: ```python - from haystack.components.generators import OpenAIGenerator + from openai import AsyncOpenAI + from ragas.llms import llm_factory + from ragas.metrics.collections import Faithfulness from haystack_integrations.components.evaluators.ragas import RagasEvaluator - from ragas.metrics import ContextPrecision - from ragas.llms import HaystackLLMWrapper - llm = OpenAIGenerator(model="gpt-4o-mini") - evaluator_llm = HaystackLLMWrapper(llm) + client = AsyncOpenAI() + llm = llm_factory("gpt-4o-mini", client=client) evaluator = RagasEvaluator( - ragas_metrics=[ContextPrecision()], - evaluator_llm=evaluator_llm + ragas_metrics=[Faithfulness(llm=llm)], ) output = evaluator.run( query="Which is the most popular global sport?", @@ -53,52 +50,59 @@ class RagasEvaluator: ``` """ - def __init__( - self, - ragas_metrics: list[Metric], - evaluator_llm: BaseRagasLLM | None = None, - evaluator_embedding: BaseRagasEmbeddings | None = None, - ) -> None: + def __init__(self, ragas_metrics: list[SimpleBaseMetric]) -> None: """ Constructs a new Ragas evaluator. - :param ragas_metrics: A list of evaluation metrics from the [Ragas](https://docs.ragas.io/) library. - :param evaluator_llm: A language model used by metrics that require LLMs for evaluation. - :param evaluator_embedding: An embedding model used by metrics that require embeddings for evaluation. + :param ragas_metrics: A list of modern Ragas metrics from `ragas.metrics.collections`. + Each metric must be fully configured (including its LLM) at construction time. + Available metrics can be found in the + [Ragas documentation](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/). """ - self._validate_inputs(ragas_metrics, evaluator_llm, evaluator_embedding) + self._validate_inputs(ragas_metrics) self.metrics = ragas_metrics - self.llm = evaluator_llm - self.embedding = evaluator_embedding - def _validate_inputs( - self, - metrics: list[Metric], - llm: BaseRagasLLM | None, - embedding: BaseRagasEmbeddings | None, - ) -> None: + @staticmethod + def _validate_inputs(metrics: list[SimpleBaseMetric]) -> None: """ Validate input parameters. - :param metrics: List of Ragas metrics to validate - :param llm: Language model to validate - :param embedding: Embedding model to validate - + :param metrics: List of Ragas metrics to validate. :return: None. """ - if not all(isinstance(metric, Metric) for metric in metrics): - error_message = "All items in ragas_metrics must be instances of Metric class." + if not all(isinstance(metric, SimpleBaseMetric) for metric in metrics): + error_message = "All items in ragas_metrics must be instances of SimpleBaseMetric." raise TypeError(error_message) - if llm is not None and not isinstance(llm, BaseRagasLLM): - error_message = f"Expected evaluator_llm to be BaseRagasLLM, got {type(llm).__name__}" - raise TypeError(error_message) + def to_dict(self) -> dict[str, Any]: + """ + Serialize this component to a dictionary. - if embedding is not None and not isinstance(embedding, BaseRagasEmbeddings): - error_message = f"Expected evaluator_embedding to be BaseRagasEmbeddings, got {type(embedding).__name__}" - raise TypeError(error_message) + :returns: + Dictionary with serialized data. + """ + return default_to_dict(self, ragas_metrics=[_serialize_metric(m) for m in self.metrics]) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "RagasEvaluator": + """ + Deserialize this component from a dictionary. 
+ + Metrics are reconstructed from their stored class path and LLM/embedding + configuration. Only the `openai` provider is supported for automatic + deserialization; the API key is read from the `OPENAI_API_KEY` environment + variable at load time. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + metrics_data = data.get("init_parameters", {}).get("ragas_metrics", []) + data["init_parameters"]["ragas_metrics"] = [_deserialize_metric(m) for m in metrics_data] + return default_from_dict(cls, data) - @component.output_types(result=EvaluationResult) + @component.output_types(result=dict[str, MetricResult]) def run( self, query: str | None = None, @@ -108,9 +112,9 @@ def run( multi_responses: list[str] | None = None, reference: str | None = None, rubrics: dict[str, str] | None = None, - ) -> dict[str, Any]: + ) -> dict[str, dict[str, MetricResult]]: """ - Evaluates the provided query against the documents and returns the evaluation result. + Evaluates the provided inputs against each metric and returns the results. :param query: The input query from the user. :param response: A list of ChatMessage responses (typically from a language model or agent). @@ -120,7 +124,7 @@ def run( :param reference: A string reference answer for the query. :param rubrics: A dictionary of evaluation rubric, where keys represent the score and the values represent the corresponding evaluation criteria. - :return: A dictionary containing the evaluation result. + :return: A dictionary with key `result` mapping metric names to their `MetricResult`. """ processed_docs = self._process_documents(documents) processed_response = self._process_response(response) @@ -135,30 +139,41 @@ def run( reference=reference, rubrics=rubrics, ) - - except (ValueError, ValidationError) as e: + except ValidationError as e: self._handle_conversion_error(e) - dataset = EvaluationDataset([sample]) + results: dict[str, MetricResult] = {} + for metric in self.metrics: + results[metric.name] = self._score_metric(metric, sample) - try: - result = evaluate( - dataset=dataset, - metrics=self.metrics, - llm=self.llm, - embeddings=self.embedding, - ) - except (ValueError, ValidationError) as e: - self._handle_evaluation_error(e) + return {"result": results} - return {"result": result} + def _score_metric(self, metric: SimpleBaseMetric, sample: SingleTurnSample) -> MetricResult: + """ + Score a metric by inspecting its ascore() signature and passing only matching sample fields. + + :param metric: A SimpleBaseMetric instance to score. + :param sample: The SingleTurnSample holding all available input fields. + :return: MetricResult from the metric. + """ + sig = inspect.signature(metric.ascore) + excluded = {"self", "callbacks"} + valid_params = { + name + for name, param in sig.parameters.items() + if name not in excluded + and param.kind not in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD) + } + sample_dict = sample.model_dump() + kwargs = {k: v for k, v in sample_dict.items() if k in valid_params and v is not None} + return metric.score(**kwargs) def _process_documents(self, documents: list[Document | str] | None) -> list[str] | None: """ Process and validate input documents. - :param documents: List of Documents or strings to process - :return: List of document contents as strings or None + :param documents: List of Documents or strings to process. + :return: List of document contents as strings or None.
""" if documents is None: return None @@ -178,10 +193,10 @@ def _process_response(self, response: list[ChatMessage] | str | None) -> str | N """ Process response into expected format. - :param response: Response to process - :return: None or Processed response string + :param response: Response to process. + :return: None or processed response string. """ - if isinstance(response, list): # Check if response is a list + if isinstance(response, list): if all(isinstance(item, ChatMessage) and item.text for item in response): return response[0].text return None @@ -191,15 +206,12 @@ def _process_response(self, response: list[ChatMessage] | str | None) -> str | N def _handle_conversion_error(self, error: Exception) -> None: """ - Handle evaluation errors with improved messages. + Re-raise pydantic validation errors from SingleTurnSample with Haystack-friendly field names. - :params error: Original error + :params error: Original error. """ if isinstance(error, ValidationError): - field_mapping = { - "user_input": "query", - "retrieved_contexts": "documents", - } + field_mapping = {"user_input": "query", "retrieved_contexts": "documents"} for err in error.errors(): # loc is a tuple of strings and ints but according to pydantic docs, the first element is a string # https://docs.pydantic.dev/latest/errors/errors/ @@ -217,26 +229,6 @@ def _handle_conversion_error(self, error: Exception) -> None: ) raise ValueError(error_message) - def _handle_evaluation_error(self, error: Exception) -> None: - error_message = str(error) - columns_match = re.search(r"additional columns \[(.*?)\]", error_message) - field_mapping = { - "user_input": "query", - "retrieved_contexts": "documents", - } - if columns_match: - columns_str = columns_match.group(1) - columns = [col.strip().strip("'") for col in columns_str.split(",")] - - mapped_columns = [field_mapping.get(col, col) for col in columns] - updated_columns_str = "[" + ", ".join(f"'{col}'" for col in mapped_columns) + "]" - - # Update the list of columns in the error message - updated_error_message = error_message.replace( - columns_match.group(0), f"additional columns {updated_columns_str}" - ) - raise ValueError(updated_error_message) - def _get_expected_type_description(self, expected_type: Any) -> str: """Helper method to get a description of the expected type.""" if get_origin(expected_type) is Union: @@ -252,21 +244,20 @@ def _get_expected_type_description(self, expected_type: Any) -> str: value_type_name = getattr(value_type, "__name__", str(value_type)) return f"a dictionary with keys of type {key_type_name} and values of type {value_type_name}" else: - # Handle non-generic types or unknown types gracefully return getattr(expected_type, "__name__", str(expected_type)) def _get_example_input(self, field: str) -> str: """ Helper method to get an example input based on the field. - :param field: Arguement used to make SingleTurnSample. + :param field: Argument used to make SingleTurnSample. :returns: Example usage for the field. 
""" examples = { "query": "A string query like 'Question?'", "documents": "[Document(content='Example content')]", "reference_contexts": "['Example string 1', 'Example string 2']", - "response": "ChatMessage(_content='Hi', _role='assistant')", + "response": "ChatMessage.from_assistant('Hi')", "multi_responses": "['Response 1', 'Response 2']", "reference": "'A reference string'", "rubrics": "{'score1': 'high_similarity'}", diff --git a/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/utils.py b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/utils.py new file mode 100644 index 0000000000..2e25e2f1b3 --- /dev/null +++ b/integrations/ragas/src/haystack_integrations/components/evaluators/ragas/utils.py @@ -0,0 +1,74 @@ +# SPDX-FileCopyrightText: 2026-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +import importlib +from typing import Any + +from openai import AsyncOpenAI + +from ragas.embeddings.base import embedding_factory +from ragas.llms import llm_factory +from ragas.metrics.base import SimpleBaseMetric + + +def _serialize_metric(metric: SimpleBaseMetric) -> dict[str, Any]: + """ + Serialize a `SimpleBaseMetric` to a JSON-compatible dict. + + Stores the class path, metric name, and — when present — the LLM and + embeddings configuration (provider and model name). + + :param metric: The metric instance to serialize. + :returns: A dict suitable for storage in a pipeline YAML or `to_dict` output. + """ + metric_cls = type(metric) + serialized: dict[str, Any] = { + "type": f"{metric_cls.__module__}.{metric_cls.__qualname__}", + "name": metric.name, + } + llm = getattr(metric, "llm", None) + if llm is not None: + serialized["llm"] = {"model": llm.model, "provider": llm.provider} + embeddings = getattr(metric, "embeddings", None) + if embeddings is not None: + serialized["embeddings"] = {"model": embeddings.model, "provider": embeddings.PROVIDER_NAME} + return serialized + + +def _deserialize_metric(data: dict[str, Any]) -> SimpleBaseMetric: + """ + Reconstruct a `SimpleBaseMetric` from a serialized dict. + + Imports the metric class from the stored `type` path and rebuilds any LLM + or embeddings using the stored provider and model name. Only the `openai` + provider is supported for automatic reconstruction; the API key is read from + the `OPENAI_API_KEY` environment variable at deserialization time. + + :param data: Dict produced by `_serialize_metric`. + :returns: A fully constructed `SimpleBaseMetric` instance. + :raises ValueError: If a non-`openai` provider is encountered. + """ + type_path = data["type"] + module_path, class_name = type_path.rsplit(".", 1) + metric_cls = getattr(importlib.import_module(module_path), class_name) + + kwargs: dict[str, Any] = {} + + if "llm" in data: + llm_data = data["llm"] + if llm_data["provider"] != "openai": + msg = f"Automatic deserialization only supports the 'openai' provider; got '{llm_data['provider']}'." + raise ValueError(msg) + kwargs["llm"] = llm_factory(llm_data["model"], client=AsyncOpenAI()) + + if "embeddings" in data: + emb_data = data["embeddings"] + if emb_data["provider"] != "openai": + msg = f"Automatic deserialization only supports the 'openai' provider; got '{emb_data['provider']}'." 
+ raise ValueError(msg) + kwargs["embeddings"] = embedding_factory("openai", model=emb_data["model"], client=AsyncOpenAI()) + + if "name" in data: + kwargs["name"] = data["name"] + + return metric_cls(**kwargs) diff --git a/integrations/ragas/tests/test_evaluator.py b/integrations/ragas/tests/test_evaluator.py index 1929453726..fd6197944b 100644 --- a/integrations/ragas/tests/test_evaluator.py +++ b/integrations/ragas/tests/test_evaluator.py @@ -1,150 +1,361 @@ -import pytest -from unittest import mock +import os from unittest.mock import MagicMock -from ragas.metrics import Metric, Faithfulness -from ragas.llms import BaseRagasLLM -from ragas.embeddings import BaseRagasEmbeddings -from ragas.dataset_schema import EvaluationResult -from haystack import Document + +import pytest +from haystack import Document, Pipeline +from haystack.components.builders import AnswerBuilder, ChatPromptBuilder +from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever +from haystack.dataclasses import ChatMessage +from haystack.document_stores.in_memory import InMemoryDocumentStore +from openai import AsyncOpenAI +from ragas.embeddings.base import embedding_factory +from ragas.llms import llm_factory +from ragas.metrics.base import SimpleBaseMetric +from ragas.metrics.collections import AnswerRelevancy, ContextPrecision, Faithfulness +from ragas.metrics.result import MetricResult + from haystack_integrations.components.evaluators.ragas import RagasEvaluator -# Fixtures -@pytest.fixture -def mock_run(): - """Fixture to mock the 'run' method of RagasEvaluator.""" - with mock.patch.object(RagasEvaluator, 'run') as mock_method: - yield mock_method - - -@pytest.fixture -def ragas_evaluator(): - """Fixture to create a valid RagasEvaluator instance.""" - valid_metrics = [MagicMock(spec=Metric) for _ in range(3)] - valid_llm = MagicMock(spec=BaseRagasLLM) - valid_embedding = MagicMock(spec=BaseRagasEmbeddings) - return RagasEvaluator( - ragas_metrics=valid_metrics, - evaluator_llm=valid_llm, - evaluator_embedding=valid_embedding, - ) +class ConcreteMetric(SimpleBaseMetric): + """Minimal concrete SimpleBaseMetric for serialization tests.""" + def __init__(self, name: str = "concrete_metric", llm=None, embeddings=None): + self.name = name + self.llm = llm + self.embeddings = embeddings -# Tests -def test_successful_initialization(ragas_evaluator): - """Test RagasEvaluator initializes correctly with valid inputs.""" - assert len(ragas_evaluator.metrics) == 3 - assert isinstance(ragas_evaluator.llm, BaseRagasLLM) - assert isinstance(ragas_evaluator.embedding, BaseRagasEmbeddings) + async def ascore(self, user_input: str, response: str) -> MetricResult: + return MetricResult(value=1.0, reason="test") + def score(self, **kwargs) -> MetricResult: + return MetricResult(value=1.0, reason="test") -def test_invalid_metrics(): - """Test RagasEvaluator raises TypeError for invalid metrics.""" - invalid_metric = "not_a_metric" - with pytest.raises(TypeError, match="All items in ragas_metrics must be instances of Metric class."): - RagasEvaluator(ragas_metrics=[invalid_metric]) +def make_metric(name: str, score: float = 0.8, reason: str = "test reason") -> MagicMock: + """Create a mock SimpleBaseMetric with a concrete ascore signature for inspect.signature.""" + metric = MagicMock(spec=SimpleBaseMetric) + metric.name = name + metric.score.return_value = 
MetricResult(value=score, reason=reason) + async def ascore(user_input: str, response: str, retrieved_contexts: list) -> MetricResult: + return MetricResult(value=score, reason=reason) -def test_invalid_llm(): - """Test RagasEvaluator raises TypeError for invalid evaluator_llm.""" - valid_metric = MagicMock(spec=Metric) - invalid_llm = "not_a_llm" + metric.ascore = ascore + return metric - with pytest.raises(TypeError, match="Expected evaluator_llm to be BaseRagasLLM"): - RagasEvaluator(ragas_metrics=[valid_metric], evaluator_llm=invalid_llm) +class TestInit: + def test_init(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + metric = Faithfulness(llm=llm_factory("gpt-4o-mini", client=AsyncOpenAI())) + evaluator = RagasEvaluator(ragas_metrics=[metric]) + assert evaluator.metrics == [metric] -def test_invalid_embedding(): - """Test RagasEvaluator raises TypeError for invalid evaluator_embedding.""" - valid_metric = MagicMock(spec=Metric) - invalid_embedding = "not_an_embedding" + def test_init_with_multiple_metrics(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + llm = llm_factory("gpt-4o-mini", client=AsyncOpenAI()) + metrics = [ + Faithfulness(llm=llm), + AnswerRelevancy( + llm=llm, embeddings=embedding_factory("openai", model="text-embedding-3-small", client=AsyncOpenAI()) + ), + ] + evaluator = RagasEvaluator(ragas_metrics=metrics) + assert len(evaluator.metrics) == 2 - with pytest.raises(TypeError, match="Expected evaluator_embedding to be BaseRagasEmbeddings"): - RagasEvaluator(ragas_metrics=[valid_metric], evaluator_embedding=invalid_embedding) + def test_invalid_metrics_raises_type_error(self): + with pytest.raises(TypeError, match="All items in ragas_metrics must be instances of SimpleBaseMetric."): + RagasEvaluator(ragas_metrics=["not_a_metric"]) -def test_initializer_allows_optional_llm_and_embeddings(): - """Test RagasEvaluator initializes correctly with None for optional parameters.""" - valid_metric = MagicMock(spec=Metric) +class TestRun: + def test_run_returns_result_by_metric_name(self, monkeypatch): + metric = make_metric("faithfulness", score=0.9) + evaluator = RagasEvaluator(ragas_metrics=[metric]) + output = evaluator.run( + query="Which is the most popular global sport?", + response="Football is the most popular sport.", + documents=["Football is undoubtedly the world's most popular sport."], + ) + assert "result" in output + assert "faithfulness" in output["result"] + result = output["result"]["faithfulness"] + assert isinstance(result, MetricResult) + assert result.value == 0.9 - evaluator = RagasEvaluator( - ragas_metrics=[valid_metric], - evaluator_llm=None, - evaluator_embedding=None, - ) - assert evaluator.metrics == [valid_metric] - assert evaluator.llm is None - assert evaluator.embedding is None - - -@pytest.mark.parametrize( - "invalid_input,field_name,error_message", - [ - (["Invalid query type"], "query", "'query' field expected"), - ([123, ["Invalid document"]], "documents", "'documents' must be a list"), - (["score_1"], "rubrics", "'rubrics' field expected"), - ], -) -def test_run_invalid_inputs(invalid_input, field_name, error_message): - """Test RagasEvaluator raises ValueError for invalid input types.""" - evaluator = RagasEvaluator(ragas_metrics=[Faithfulness()]) - query = "Which is the most popular global sport?" 
- documents = ["Football is the most popular sport."] - response = "Football is the most popular sport in the world" - - with pytest.raises(ValueError) as exc_info: - if field_name == "query": - evaluator.run(query=invalid_input, documents=documents, response=response) - elif field_name == "documents": - evaluator.run(query=query, documents=invalid_input, response=response) - elif field_name == "rubrics": - evaluator.run(query=query, rubrics=invalid_input, documents=documents, response=response) - - assert error_message in str(exc_info.value) - - -def test_missing_columns_in_dataset(): - """Test if RagasEvaluator raises a ValueError when required columns are missing for a specific metric.""" - evaluator = RagasEvaluator(ragas_metrics=[Faithfulness()]) - query = "Which is the most popular global sport?" - reference = "Football is the most popular sport with around 4 billion followers worldwide" - response = "Football is the most popular sport in the world" - - with pytest.raises(ValueError) as exc_info: - evaluator.run(query=query, reference=reference, response=response) - - assert "faithfulness" in str(exc_info.value) - assert "documents" in str(exc_info.value) - - -def test_run_valid_input(mock_run): - """Test RagasEvaluator runs successfully with valid input.""" - mock_run.return_value = {"result": {"score": MagicMock(), "details": MagicMock(spec=EvaluationResult)}} - evaluator = RagasEvaluator(ragas_metrics=[MagicMock(Metric)]) - - query = "Which is the most popular global sport?" - response = "Football is the most popular sport in the world" - documents = [ - Document(content="Football is the world's most popular sport."), - Document(content="Football has over 4 billion followers."), - ] - reference_contexts = ["Football is a globally popular sport."] - multi_responses = ["Football is considered the most popular sport."] - reference = "Football is the most popular sport with around 4 billion followers worldwide" - rubrics = {"accuracy": "high", "relevance": "high"} - - output = evaluator.run( - query=query, - response=response, - documents=documents, - reference_contexts=reference_contexts, - multi_responses=multi_responses, - reference=reference, - rubrics=rubrics, + def test_run_scores_all_metrics(self): + metrics = [make_metric("faithfulness", 0.9), make_metric("answer_relevancy", 0.7)] + evaluator = RagasEvaluator(ragas_metrics=metrics) + output = evaluator.run(query="test?", response="answer", documents=["doc"]) + assert set(output["result"].keys()) == {"faithfulness", "answer_relevancy"} + assert output["result"]["faithfulness"].value == 0.9 + assert output["result"]["answer_relevancy"].value == 0.7 + + def test_run_calls_score_on_each_metric(self): + metric_a = make_metric("faithfulness") + metric_b = make_metric("answer_relevancy") + evaluator = RagasEvaluator(ragas_metrics=[metric_a, metric_b]) + evaluator.run(query="test?", response="answer", documents=["doc"]) + metric_a.score.assert_called_once() + metric_b.score.assert_called_once() + + def test_score_metric_passes_only_matching_params(self): + """Metric that only needs user_input + response should not receive retrieved_contexts.""" + metric = MagicMock(spec=SimpleBaseMetric) + metric.name = "selective_metric" + metric.score.return_value = MetricResult(value=0.5, reason="ok") + + async def ascore(user_input: str, response: str) -> MetricResult: + return MetricResult(value=0.5, reason="ok") + + metric.ascore = ascore + + evaluator = RagasEvaluator(ragas_metrics=[metric]) + evaluator.run(query="test?", response="answer", 
documents=["doc"], reference="ref") + metric.score.assert_called_once_with(user_input="test?", response="answer") + + def test_score_metric_omits_none_fields(self): + metric = make_metric("faithfulness") + evaluator = RagasEvaluator(ragas_metrics=[metric]) + evaluator.run(query="test?", response="answer") # no documents → retrieved_contexts=None + _, kwargs = metric.score.call_args + assert "retrieved_contexts" not in kwargs + + def test_run_accepts_document_objects(self): + metric = make_metric("faithfulness") + evaluator = RagasEvaluator(ragas_metrics=[metric]) + evaluator.run( + query="test?", + response="answer", + documents=[Document(content="some content"), Document(content="more content")], + ) + _, kwargs = metric.score.call_args + assert kwargs["retrieved_contexts"] == ["some content", "more content"] + + def test_run_accepts_string_documents(self): + metric = make_metric("faithfulness") + evaluator = RagasEvaluator(ragas_metrics=[metric]) + evaluator.run(query="test?", response="answer", documents=["doc one", "doc two"]) + _, kwargs = metric.score.call_args + assert kwargs["retrieved_contexts"] == ["doc one", "doc two"] + + @pytest.mark.parametrize( + "invalid_input,field_name,error_message", + [ + (["Invalid query type"], "query", "'query' field expected"), + ([123, ["Invalid document"]], "documents", "'documents' must be a list"), + (["score_1"], "rubrics", "'rubrics' field expected"), + ], ) + def test_run_raises_on_invalid_input_types(self, invalid_input, field_name, error_message): + evaluator = RagasEvaluator(ragas_metrics=[make_metric("faithfulness")]) + query = "Which is the most popular global sport?" + documents = ["Football is the most popular sport."] + response = "Football is the most popular sport in the world" + + with pytest.raises(ValueError) as exc_info: + if field_name == "query": + evaluator.run(query=invalid_input, documents=documents, response=response) + elif field_name == "documents": + evaluator.run(query=query, documents=invalid_input, response=response) + elif field_name == "rubrics": + evaluator.run(query=query, rubrics=invalid_input, documents=documents, response=response) + + assert error_message in str(exc_info.value) + + +class TestSerialization: + def test_to_dict(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + llm = llm_factory("gpt-4o-mini", client=AsyncOpenAI()) + evaluator = RagasEvaluator(ragas_metrics=[ConcreteMetric(llm=llm), ConcreteMetric(name="another_metric")]) + data = evaluator.to_dict() + assert data == { + "type": "haystack_integrations.components.evaluators.ragas.evaluator.RagasEvaluator", + "init_parameters": { + "ragas_metrics": [ + { + "type": "tests.test_evaluator.ConcreteMetric", + "name": "concrete_metric", + "llm": {"model": "gpt-4o-mini", "provider": "openai"}, + }, + {"type": "tests.test_evaluator.ConcreteMetric", "name": "another_metric"}, + ] + }, + } + + def test_from_dict(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + data = { + "type": "haystack_integrations.components.evaluators.ragas.evaluator.RagasEvaluator", + "init_parameters": { + "ragas_metrics": [ + { + "type": "tests.test_evaluator.ConcreteMetric", + "name": "concrete_metric", + "llm": {"model": "gpt-4o-mini", "provider": "openai"}, + }, + ], + }, + } + reconstructed = RagasEvaluator.from_dict(data) + assert len(reconstructed.metrics) == 1 + assert reconstructed.metrics[0].name == "concrete_metric" + + def test_from_dict_raises_for_unsupported_provider(self): + data = { + "type": 
"haystack_integrations.components.evaluators.ragas.evaluator.RagasEvaluator", + "init_parameters": { + "ragas_metrics": [ + { + "type": "tests.test_evaluator.ConcreteMetric", + "name": "some_metric", + "llm": {"model": "gemini-pro", "provider": "google"}, + } + ] + }, + } + + with pytest.raises(ValueError, match="only supports the 'openai' provider"): + RagasEvaluator.from_dict(data) + + +@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="Set OPENAI_API_KEY to run integration tests.") +@pytest.mark.integration +class TestStandaloneEvaluationIntegration: + def make_llm(self): + return llm_factory("gpt-4o-mini", client=AsyncOpenAI()) + + def make_embeddings(self): + return embedding_factory("openai", model="text-embedding-3-small", client=AsyncOpenAI()) + + def test_faithfulness_returns_valid_score(self): + evaluator = RagasEvaluator(ragas_metrics=[Faithfulness(llm=self.make_llm())]) + + output = evaluator.run( + query="What makes Meta AI's LLaMA models stand out?", + response="Meta AI's LLaMA models stand out for being open-source.", + documents=[ + "Meta AI is best known for its LLaMA series, which has been made open-source " + "for researchers and developers. LLaMA models are praised for their ability to " + "support innovation and experimentation due to their accessibility." + ], + ) + + result = output["result"]["faithfulness"] + assert isinstance(result, MetricResult) + assert 0.0 <= result.value <= 1.0 + + def test_answer_relevancy_uses_only_query_and_response(self): + """AnswerRelevancy only declares user_input + response in ascore — documents should not be forwarded.""" + evaluator = RagasEvaluator( + ragas_metrics=[AnswerRelevancy(llm=self.make_llm(), embeddings=self.make_embeddings())] + ) + + output = evaluator.run( + query="What makes Meta AI's LLaMA models stand out?", + response="They are open-source and freely available to researchers.", + documents=["Meta AI released LLaMA as an open-source model."], + ) + + result = output["result"]["answer_relevancy"] + assert isinstance(result, MetricResult) + assert 0.0 <= result.value <= 1.0 + + def test_multiple_metrics_all_return_results(self): + llm = self.make_llm() + embeddings = self.make_embeddings() + evaluator = RagasEvaluator( + ragas_metrics=[ + Faithfulness(llm=llm), + AnswerRelevancy(llm=llm, embeddings=embeddings), + ContextPrecision(llm=llm), + ] + ) + + output = evaluator.run( + query="What makes Meta AI's LLaMA models stand out?", + response=( + "Meta AI's LLaMA models stand out for being open-source, supporting " + "innovation and experimentation due to their accessibility and strong performance." + ), + documents=[ + "Meta AI is best known for its LLaMA series, which has been made open-source.", + "Meta AI with its LLaMA models aims to democratize AI development by making " + "high-quality models available for free, fostering collaboration across industries.", + ], + reference=( + "Meta AI's LLaMA models stand out for being open-source, supporting innovation " + "and experimentation due to their accessibility and strong performance." 
+ ), + ) + + assert set(output["result"].keys()) == {"faithfulness", "answer_relevancy", "context_precision"} + for metric_result in output["result"].values(): + assert isinstance(metric_result, MetricResult) + assert 0.0 <= metric_result.value <= 1.0 + + +@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="Set OPENAI_API_KEY to run integration tests.") +@pytest.mark.integration +class TestPipelineIntegration: + def test_ragas_evaluator_in_rag_pipeline(self): + dataset = [ + "Meta AI is best known for its LLaMA series, which has been made open-source " + "for researchers and developers.", + "LLaMA models are praised for their ability to support innovation and " + "experimentation due to their accessibility and strong performance.", + "Meta AI with its LLaMA models aims to democratize AI development by making " + "high-quality models available for free.", + ] + + document_store = InMemoryDocumentStore() + docs = [Document(content=text) for text in dataset] + document_embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small") + document_store.write_documents(document_embedder.run(docs)["documents"]) + + ragas_evaluator = RagasEvaluator( + ragas_metrics=[Faithfulness(llm=llm_factory("gpt-4o-mini", client=AsyncOpenAI()))] + ) + + template = [ + ChatMessage.from_user( + "Answer the question based on the context.\n\n" + "Context:\n{% for document in documents %}{{ document.content }}\n{% endfor %}\n\n" + "Question: {{question}}\nAnswer:" + ) + ] + + pipeline = Pipeline() + pipeline.add_component("text_embedder", OpenAITextEmbedder(model="text-embedding-3-small")) + pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=2)) + pipeline.add_component("prompt_builder", ChatPromptBuilder(template=template, required_variables="*")) + pipeline.add_component("llm", OpenAIChatGenerator(model="gpt-4o-mini")) + pipeline.add_component("answer_builder", AnswerBuilder()) + pipeline.add_component("ragas_evaluator", ragas_evaluator) + + pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + pipeline.connect("retriever", "prompt_builder") + pipeline.connect("prompt_builder.prompt", "llm.messages") + pipeline.connect("llm.replies", "answer_builder.replies") + pipeline.connect("retriever", "answer_builder.documents") + pipeline.connect("retriever", "ragas_evaluator.documents") + pipeline.connect("llm.replies", "ragas_evaluator.response") + + question = "What makes Meta AI's LLaMA models stand out?" 
+ result = pipeline.run( + { + "text_embedder": {"text": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + "ragas_evaluator": {"query": question}, + } + ) - assert "result" in output - assert isinstance(output["result"], dict) - assert "score" in output["result"] - assert isinstance(output["result"]["details"], EvaluationResult) + assert "ragas_evaluator" in result + faithfulness_result = result["ragas_evaluator"]["result"]["faithfulness"] + assert isinstance(faithfulness_result, MetricResult) + assert 0.0 <= faithfulness_result.value <= 1.0 diff --git a/integrations/ragas/tests/test_utils.py b/integrations/ragas/tests/test_utils.py new file mode 100644 index 0000000000..d45b7999b1 --- /dev/null +++ b/integrations/ragas/tests/test_utils.py @@ -0,0 +1,82 @@ +import pytest +from openai import AsyncOpenAI +from ragas.embeddings.base import embedding_factory +from ragas.llms import llm_factory +from ragas.metrics.base import SimpleBaseMetric +from ragas.metrics.result import MetricResult + +from haystack_integrations.components.evaluators.ragas.utils import _deserialize_metric, _serialize_metric + + +class ConcreteMetric(SimpleBaseMetric): + """Minimal concrete SimpleBaseMetric for serialization tests.""" + + def __init__(self, name: str = "concrete_metric", llm=None, embeddings=None): + self.name = name + self.llm = llm + self.embeddings = embeddings + + async def ascore(self, user_input: str, response: str) -> MetricResult: + return MetricResult(value=1.0, reason="test") + + def score(self, **kwargs) -> MetricResult: + return MetricResult(value=1.0, reason="test") + + +def test_serialization(monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + result = _serialize_metric( + ConcreteMetric( + llm=llm_factory("gpt-4o-mini", client=AsyncOpenAI()), + embeddings=embedding_factory("openai", model="text-embedding-3-small", client=AsyncOpenAI()), + ) + ) + assert result == { + "type": "tests.test_utils.ConcreteMetric", + "name": "concrete_metric", + "llm": {"model": "gpt-4o-mini", "provider": "openai"}, + "embeddings": {"model": "text-embedding-3-small", "provider": "openai"}, + } + + +class TestDeserializeMetric: + def test_deserialization(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test") + data = { + "type": "tests.test_utils.ConcreteMetric", + "name": "concrete_metric", + "llm": {"model": "gpt-4o-mini", "provider": "openai"}, + "embeddings": {"model": "text-embedding-3-small", "provider": "openai"}, + } + result = _deserialize_metric(data) + assert isinstance(result, ConcreteMetric) + assert result.name == "concrete_metric" + assert result.llm.model == "gpt-4o-mini" + assert result.embeddings.model == "text-embedding-3-small" + + def test_raises_for_unsupported_llm_provider(self): + data = { + "type": "tests.test_utils.ConcreteMetric", + "name": "concrete_metric", + "llm": {"model": "gemini-pro", "provider": "google"}, + } + + with pytest.raises(ValueError, match="only supports the 'openai' provider"): + _deserialize_metric(data) + + def test_raises_for_unsupported_embeddings_provider(self): + data = { + "type": "tests.test_utils.ConcreteMetric", + "name": "concrete_metric", + "embeddings": {"model": "embedding-001", "provider": "google"}, + } + + with pytest.raises(ValueError, match="only supports the 'openai' provider"): + _deserialize_metric(data) + + def test_round_trip(self): + metric = ConcreteMetric(name="round_trip") + result = _deserialize_metric(_serialize_metric(metric)) + + assert isinstance(result, 
ConcreteMetric) + assert result.name == "round_trip"
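A minimal sketch of the serialization round-trip the new `to_dict`/`from_dict` methods enable, outside the patch itself; it assumes ragas>=0.4 exposes `llm_factory` and `ragas.metrics.collections.Faithfulness` as used above, and that `OPENAI_API_KEY` is set in the environment:

```python
# Illustrative sketch, not part of the patch. Assumes Faithfulness accepts the
# stored name/llm kwargs the way the test-only ConcreteMetric does, and that
# OPENAI_API_KEY is set: the key is never serialized, from_dict() re-reads it
# from the environment via llm_factory().
from openai import AsyncOpenAI
from ragas.llms import llm_factory
from ragas.metrics.collections import Faithfulness

from haystack_integrations.components.evaluators.ragas import RagasEvaluator

llm = llm_factory("gpt-4o-mini", client=AsyncOpenAI())
evaluator = RagasEvaluator(ragas_metrics=[Faithfulness(llm=llm)])

data = evaluator.to_dict()
# data["init_parameters"]["ragas_metrics"][0] holds the metric's class path,
# its name, and the LLM's provider/model ("openai", "gpt-4o-mini"); no secrets.

restored = RagasEvaluator.from_dict(data)
assert restored.metrics[0].name == evaluator.metrics[0].name
```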