From 07d7cd4e7863a73b8b1b9f9abb0bbe8fe0f55e7e Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 10 Jun 2025 10:21:04 +0200 Subject: [PATCH 1/6] Types cleanup --- .../google_genai/document_embedder.py | 26 +++++++++++++------ .../embedders/google_genai/text_embedder.py | 5 ++-- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py index 4f143a07e0..d8fb95e6d8 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py @@ -23,7 +23,7 @@ class GoogleGenAIDocumentEmbedder: ```python from haystack import Document - from haystack_integrations.components.embedders import GoogleGenAIDocumentEmbedder + from haystack_integrations.components.embedders.google_genai import GoogleGenAIDocumentEmbedder doc = Document(content="I love pizza!") @@ -48,7 +48,7 @@ def __init__( meta_fields_to_embed: Optional[List[str]] = None, embedding_separator: str = "\n", config: Optional[Dict[str, Any]] = None, - ): + ) -> None: """ Creates an GoogleGenAIDocumentEmbedder component. @@ -139,10 +139,13 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: return texts_to_embed - def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List[List[float]], Dict[str, Any]]: + def _embed_batch( + self, texts_to_embed: List[str], batch_size: int + ) -> Tuple[List[Optional[List[float]]], Dict[str, Any]]: """ Embed a list of texts in batches. """ + resolved_config = types.EmbedContentConfig(**self._config) if self._config else None all_embeddings = [] meta: Dict[str, Any] = {} @@ -150,13 +153,19 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List batched(texts_to_embed, batch_size), disable=not self._progress_bar, desc="Calculating embeddings" ): args: Dict[str, Any] = {"model": self._model, "contents": [b[1] for b in batch]} - if self._config: - args["config"] = types.EmbedContentConfig(**self._config) if self._config else None + if resolved_config: + args["config"] = resolved_config response = self._client.models.embed_content(**args) - embeddings = [el.values for el in response.embeddings] - all_embeddings.extend(embeddings) + # TODO Decide if we should return None or empty List + embeddings = [] + if response.embeddings: + for el in response.embeddings: + embeddings.append(el.values if el.values else None) + all_embeddings.extend(embeddings) + else: + all_embeddings.extend([None] * len(batch)) if "model" not in meta: meta["model"] = self._model @@ -164,7 +173,7 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List return all_embeddings, meta @component.output_types(documents=List[Document], meta=Dict[str, Any]) - def run(self, documents: List[Document]) -> Dict[str, Union[List[Document], Dict[str, Any]]]: + def run(self, documents: List[Document]) -> Union[Dict[str, List[Document]], Dict[str, Any]]: """ Embeds a list of documents. @@ -185,6 +194,7 @@ def run(self, documents: List[Document]) -> Dict[str, Union[List[Document], Dict texts_to_embed = self._prepare_texts_to_embed(documents=documents) + meta: Dict[str, Any] embeddings, meta = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self._batch_size) for doc, emb in zip(documents, embeddings): diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py index 415d5fc21d..e885088e13 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py @@ -44,7 +44,7 @@ def __init__( prefix: str = "", suffix: str = "", config: Optional[Dict[str, Any]] = None, - ): + ) -> None: """ Creates an GoogleGenAITextEmbedder component. @@ -119,7 +119,8 @@ def _prepare_input(self, text: str) -> Dict[str, Any]: return kwargs def _prepare_output(self, result: types.EmbedContentResponse) -> Dict[str, Any]: - return {"embedding": result.embeddings[0].values, "meta": {"model": self._model_name}} + embedding = result.embeddings[0].values if result.embeddings else [] + return {"embedding": embedding, "meta": {"model": self._model_name}} @component.output_types(embedding=List[float], meta=Dict[str, Any]) def run(self, text: str) -> Union[Dict[str, List[float]], Dict[str, Any]]: From 25b8d44e839b29ef4c23d81de7536bfcf0302f2e Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 10 Jun 2025 11:27:31 +0200 Subject: [PATCH 2/6] Add more tests --- .../tests/test_document_embedder.py | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py index 31e55baf43..680474d82c 100644 --- a/integrations/google_genai/tests/test_document_embedder.py +++ b/integrations/google_genai/tests/test_document_embedder.py @@ -138,6 +138,35 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): }, } + def test_from_dict(self, monkeypatch): + data = { + "type": ( + "haystack_integrations.components.embedders.google_genai.document_embedder.GoogleGenAIDocumentEmbedder" + ), + "init_parameters": { + "model": "text-embedding-004", + "prefix": "", + "suffix": "", + "batch_size": 32, + "progress_bar": True, + "meta_fields_to_embed": [], + "embedding_separator": "\n", + "api_key": {"type": "env_var", "env_vars": ["GOOGLE_API_KEY"], "strict": True}, + "config": {"task_type": "SEMANTIC_SIMILARITY"}, + }, + } + monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") + embedder = GoogleGenAIDocumentEmbedder.from_dict(data) + assert embedder._api_key.resolve_value() == "fake-api-key" + assert embedder._model == "text-embedding-004" + assert embedder._prefix == "" + assert embedder._suffix == "" + assert embedder._batch_size == 32 + assert embedder._progress_bar is True + assert embedder._meta_fields_to_embed == [] + assert embedder._embedding_separator == "\n" + assert embedder._config == {"task_type": "SEMANTIC_SIMILARITY"} + def test_prepare_texts_to_embed_w_metadata(self): documents = [ Document(id=f"{i}", content=f"document number {i}:\ncontent", meta={"meta_field": f"meta_value {i}"}) @@ -204,6 +233,4 @@ def test_run(self): assert len(doc.embedding) == 768 assert all(isinstance(x, float) for x in doc.embedding) - assert "text" in result["meta"]["model"] and "004" in result["meta"]["model"], ( - "The model name does not contain 'text' and '004'" - ) + assert result["documents"][0].meta["model"] == model From f8170037ed288fa3136451ce6b06049ede7a263f Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 10 Jun 2025 11:28:50 +0200 Subject: [PATCH 3/6] Fix more tests --- integrations/google_genai/tests/test_document_embedder.py | 2 +- integrations/google_genai/tests/test_text_embedder.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py index 680474d82c..19907a9cb6 100644 --- a/integrations/google_genai/tests/test_document_embedder.py +++ b/integrations/google_genai/tests/test_document_embedder.py @@ -233,4 +233,4 @@ def test_run(self): assert len(doc.embedding) == 768 assert all(isinstance(x, float) for x in doc.embedding) - assert result["documents"][0].meta["model"] == model + assert result["documents"][0].meta == {"model": model} diff --git a/integrations/google_genai/tests/test_text_embedder.py b/integrations/google_genai/tests/test_text_embedder.py index bb700527be..fe67907b91 100644 --- a/integrations/google_genai/tests/test_text_embedder.py +++ b/integrations/google_genai/tests/test_text_embedder.py @@ -160,6 +160,4 @@ def test_run(self): assert len(result["embedding"]) == 768 assert all(isinstance(x, float) for x in result["embedding"]) - assert "text" in result["meta"]["model"] and "004" in result["meta"]["model"], ( - "The model name does not contain 'text' and '004'" - ) + assert result["meta"] == {"model": model} From 44fa97c8f68c50b7b855289ab12a407ec87af04e Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 10 Jun 2025 11:32:16 +0200 Subject: [PATCH 4/6] Remove todo --- .../components/embedders/google_genai/document_embedder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py index d8fb95e6d8..130cc437cf 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py @@ -158,7 +158,6 @@ def _embed_batch( response = self._client.models.embed_content(**args) - # TODO Decide if we should return None or empty List embeddings = [] if response.embeddings: for el in response.embeddings: From 6437a69b3815d8c17e28a29ce73aa3ce73b883a4 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 10 Jun 2025 11:38:09 +0200 Subject: [PATCH 5/6] Fix test --- integrations/google_genai/tests/test_document_embedder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py index 19907a9cb6..a182eaeda2 100644 --- a/integrations/google_genai/tests/test_document_embedder.py +++ b/integrations/google_genai/tests/test_document_embedder.py @@ -233,4 +233,5 @@ def test_run(self): assert len(doc.embedding) == 768 assert all(isinstance(x, float) for x in doc.embedding) - assert result["documents"][0].meta == {"model": model} + assert result["documents"][0].meta == {"model": model, "topic": "Cuisine"} + assert result["documents"][1].meta == {"model": model, "topic": "ML"} From eef397df4874e35273aed1bf1342eb47f2657da2 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 10 Jun 2025 11:44:20 +0200 Subject: [PATCH 6/6] Fix tests --- integrations/google_genai/tests/test_document_embedder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py index a182eaeda2..438be2ffbf 100644 --- a/integrations/google_genai/tests/test_document_embedder.py +++ b/integrations/google_genai/tests/test_document_embedder.py @@ -233,5 +233,6 @@ def test_run(self): assert len(doc.embedding) == 768 assert all(isinstance(x, float) for x in doc.embedding) - assert result["documents"][0].meta == {"model": model, "topic": "Cuisine"} - assert result["documents"][1].meta == {"model": model, "topic": "ML"} + assert result["documents"][0].meta == {"topic": "Cuisine"} + assert result["documents"][1].meta == {"topic": "ML"} + assert result["meta"] == {"model": model}