From 0f5538c02f67cc88596783a4373f1e3cd3da6087 Mon Sep 17 00:00:00 2001
From: GunaPalanivel <gp5901@srmist.edu.in>
Date: Sat, 17 Jan 2026 10:47:26 +0530
Subject: [PATCH] fix(google_genai): use dataclass replace to avoid modifying
 input documents

This PR fixes the Google GenAI document embedder to not modify input
Documents in place when setting embeddings.

Instead of mutating the original documents in place:
  doc.embedding = emb

We now create new document instances using dataclasses.replace:
  replace(doc, embedding=emb)

This follows the established pattern from deepset-ai/haystack#9693 and
aligns with other integrations (FastEmbed, Optimum, Nvidia, Bedrock, Cohere).

Related to: deepset-ai/haystack-core-integrations#2174
---
 .../google_genai/document_embedder.py         | 11 ++--
 .../tests/test_document_embedder.py           | 55 +++++++++++++++++++
 2 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py
index a64689cfef..b8b9ef862d 100644
--- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py
+++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from dataclasses import replace
 from typing import Any, Literal
 
 from google.genai import types
@@ -281,10 +282,11 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]] | dict[str
         meta: dict[str, Any]
         embeddings, meta = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self._batch_size)
 
+        new_documents = []
         for doc, emb in zip(documents, embeddings, strict=True):
-            doc.embedding = emb
+            new_documents.append(replace(doc, embedding=emb))
 
-        return {"documents": documents, "meta": meta}
+        return {"documents": new_documents, "meta": meta}
 
     @component.output_types(documents=list[Document], meta=dict[str, Any])
     async def run_async(self, documents: list[Document]) -> dict[str, list[Document]] | dict[str, Any]:
@@ -310,7 +312,8 @@ async def run_async(self, documents: list[Document]) -> dict[str, list[Document]
 
         embeddings, meta = await self._embed_batch_async(texts_to_embed=texts_to_embed, batch_size=self._batch_size)
 
+        new_documents = []
         for doc, emb in zip(documents, embeddings, strict=True):
-            doc.embedding = emb
+            new_documents.append(replace(doc, embedding=emb))
 
-        return {"documents": documents, "meta": meta}
+        return {"documents": new_documents, "meta": meta}
diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py
index 2579801bc6..d2f1af4985 100644
--- a/integrations/google_genai/tests/test_document_embedder.py
+++ b/integrations/google_genai/tests/test_document_embedder.py
@@ -201,6 +201,61 @@ def test_run_on_empty_list(self):
         assert result["documents"] is not None
         assert not result["documents"]  # empty list
 
+    def test_run_does_not_modify_original_documents(self, monkeypatch):
+        monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key")
+        embedder = GoogleGenAIDocumentEmbedder()
+
+        docs = [
+            Document(content="I love cheese", meta={"topic": "Cuisine"}),
+            Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}),
+        ]
+
+        # Mock the _embed_batch method to return fake embeddings
+        def mock_embed_batch(texts_to_embed, batch_size):
+            embeddings = [[0.1, 0.2, 0.3] for _ in texts_to_embed]
+            meta = {"model": "text-embedding-004"}
+            return embeddings, meta
+
+        embedder._embed_batch = mock_embed_batch
+
+        result = embedder.run(documents=docs)
+
+        # Check that the original documents are not modified
+        for doc in docs:
+            assert doc.embedding is None
+
+        # Check that the returned documents have embeddings
+        for doc_with_embedding in result["documents"]:
+            assert doc_with_embedding.embedding == [0.1, 0.2, 0.3]
+
+    @pytest.mark.asyncio
+    async def test_run_async_does_not_modify_original_documents(self, monkeypatch):
+        monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key")
+        embedder = GoogleGenAIDocumentEmbedder()
+
+        docs = [
+            Document(content="I love cheese", meta={"topic": "Cuisine"}),
+            Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}),
+        ]
+
+        # Mock the _embed_batch_async method to return fake embeddings
+        async def mock_embed_batch_async(texts_to_embed, batch_size):
+            embeddings = [[0.1, 0.2, 0.3] for _ in texts_to_embed]
+            meta = {"model": "text-embedding-004"}
+            return embeddings, meta
+
+        embedder._embed_batch_async = mock_embed_batch_async
+
+        result = await embedder.run_async(documents=docs)
+
+        # Check that the original documents are not modified
+        for doc in docs:
+            assert doc.embedding is None
+
+        # Check that the returned documents have embeddings
+        for doc_with_embedding in result["documents"]:
+            assert doc_with_embedding.embedding == [0.1, 0.2, 0.3]
+
     @pytest.mark.skipif(
         not os.environ.get("GOOGLE_API_KEY", None),
         reason="Export an env var called GOOGLE_API_KEY containing the Google API key to run this test.",