From a01ec38087d77b7ecef07fc71aba67e6e107a50e Mon Sep 17 00:00:00 2001 From: GunaPalanivel Date: Tue, 20 Jan 2026 18:44:13 +0530 Subject: [PATCH] fix(watsonx): use dataclass replace to avoid modifying input documents This PR fixes the Watson X Document Embedder to avoid mutating input Documents when setting embeddings. Instead of mutating the original documents: doc.embedding = emb We now create new document instances using dataclass replace: replace(doc, embedding=emb) This follows the established pattern from deepset-ai/haystack#9693 and aligns with other integrations (FastEmbed, Optimum, Nvidia, Bedrock, Cohere, Google GenAI, Jina). Related to: deepset-ai/haystack-core-integrations#2174 --- .../embedders/watsonx/document_embedder.py | 6 +++-- .../watsonx/tests/test_document_embedder.py | 24 +++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/integrations/watsonx/src/haystack_integrations/components/embedders/watsonx/document_embedder.py b/integrations/watsonx/src/haystack_integrations/components/embedders/watsonx/document_embedder.py index 4b4b536e2e..626cea26ed 100644 --- a/integrations/watsonx/src/haystack_integrations/components/embedders/watsonx/document_embedder.py +++ b/integrations/watsonx/src/haystack_integrations/components/embedders/watsonx/document_embedder.py @@ -4,6 +4,7 @@ from __future__ import annotations +from dataclasses import replace from typing import Any from haystack import Document, component, default_from_dict, default_to_dict @@ -204,11 +205,12 @@ def run(self, documents: list[Document]) -> dict[str, list[Document] | dict[str, texts_to_embed = self._prepare_texts_to_embed(documents=documents) embeddings = self.embedder.embed_documents(texts_to_embed) + new_documents = [] for doc, emb in zip(documents, embeddings, strict=True): - doc.embedding = emb + new_documents.append(replace(doc, embedding=emb)) return { - "documents": documents, + "documents": new_documents, "meta": { "model": self.model, 
"truncate_input_tokens": self.truncate_input_tokens, diff --git a/integrations/watsonx/tests/test_document_embedder.py b/integrations/watsonx/tests/test_document_embedder.py index 6a4c50f71a..f968fdf56d 100644 --- a/integrations/watsonx/tests/test_document_embedder.py +++ b/integrations/watsonx/tests/test_document_embedder.py @@ -175,6 +175,30 @@ def test_run_empty_documents(self, mock_watsonx): "meta": {"model": "ibm/slate-30m-english-rtrvr-v2", "truncate_input_tokens": None, "batch_size": 1000}, } + def test_run_does_not_modify_original_documents(self, mock_watsonx): + """Test that original documents are not modified during embedding""" + embedder = WatsonxDocumentEmbedder(project_id=Secret.from_token("fake-project-id")) + original_docs = [ + Document(content="I love cheese"), + Document(content="A transformer is a deep learning architecture"), + ] + + # Mock the embedder to return embeddings + mock_watsonx["embeddings_instance"].embed_documents.return_value = [ + [0.1, 0.2, 0.3], + [0.4, 0.5, 0.6], + ] + + result = embedder.run(documents=original_docs) + + # Check that original documents are not modified + for doc in original_docs: + assert doc.embedding is None + + # Check that returned documents have embeddings + for doc_with_embedding in result["documents"]: + assert doc_with_embedding.embedding is not None + @pytest.mark.integration class TestWatsonxDocumentEmbedderIntegration: