Skip to content

Commit 38525c0

Browse files
committed
refactor: format code for better readability and consistency in document embedder
1 parent f20bdff commit 38525c0

2 files changed

Lines changed: 27 additions & 18 deletions

File tree

integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -85,7 +85,8 @@ def __init__(
8585
self._meta_fields_to_embed = meta_fields_to_embed or []
8686
self._embedding_separator = embedding_separator
8787
self._client = genai.Client(api_key=api_key.resolve_value())
88-
self._config = config if config is not None else {"task_type": "SEMANTIC_SIMILARITY"}
88+
self._config = config if config is not None else {
89+
"task_type": "SEMANTIC_SIMILARITY"}
8990

9091
def to_dict(self) -> Dict[str, Any]:
9192
"""
@@ -127,14 +128,14 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> Dict[str, str]:
127128
texts_to_embed: List[str] = []
128129
for doc in documents:
129130
meta_values_to_embed = [
130-
str(doc.meta[key])
131-
for key in
132-
self._meta_fields_to_embed
131+
str(doc.meta[key])
132+
for key in self._meta_fields_to_embed
133133
if key in doc.meta and doc.meta[key] is not None
134134
]
135135

136136
text_to_embed = (
137-
self._prefix + self._embedding_separator.join([*meta_values_to_embed, doc.content or ""]) + self._suffix
137+
self._prefix + self._embedding_separator.join(
138+
[*meta_values_to_embed, doc.content or ""]) + self._suffix
138139
)
139140
texts_to_embed.append(text_to_embed)
140141

@@ -150,9 +151,11 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List
150151
for batch in tqdm(
151152
batched(texts_to_embed, batch_size), disable=not self._progress_bar, desc="Calculating embeddings"
152153
):
153-
args: Dict[str, Any] = {"model": self._model, "contents": [b[1] for b in batch]}
154+
args: Dict[str, Any] = {"model": self._model,
155+
"contents": [b[1] for b in batch]}
154156
if self._config:
155-
args["config"] = types.EmbedContentConfig(**self._config) if self._config else None
157+
args["config"] = types.EmbedContentConfig(
158+
**self._config) if self._config else None
156159

157160
response = self._client.models.embed_content(**args)
158161

@@ -186,7 +189,8 @@ def run(self, documents: List[Document]) -> Dict[str, Union[List[Document], Dict
186189

187190
texts_to_embed = self._prepare_texts_to_embed(documents=documents)
188191

189-
embeddings, meta = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self._batch_size)
192+
embeddings, meta = self._embed_batch(
193+
texts_to_embed=texts_to_embed, batch_size=self._batch_size)
190194

191195
for doc, emb in zip(documents, embeddings):
192196
doc.embedding = emb

integrations/google_genai/tests/test_document_embedder.py

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -142,7 +142,8 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch):
142142

143143
def test_prepare_texts_to_embed_w_metadata(self):
144144
documents = [
145-
Document(id=f"{i}", content=f"document number {i}:\ncontent", meta={"meta_field": f"meta_value {i}"})
145+
Document(id=f"{i}", content=f"document number {i}:\ncontent", meta={
146+
"meta_field": f"meta_value {i}"})
146147
for i in range(5)
147148
]
148149

@@ -152,15 +153,16 @@ def test_prepare_texts_to_embed_w_metadata(self):
152153

153154
prepared_texts = embedder._prepare_texts_to_embed(documents)
154155
assert prepared_texts == [
155-
'meta_value 0 | document number 0:\ncontent',
156-
'meta_value 1 | document number 1:\ncontent',
157-
'meta_value 2 | document number 2:\ncontent',
158-
'meta_value 3 | document number 3:\ncontent',
159-
'meta_value 4 | document number 4:\ncontent'
156+
"meta_value 0 | document number 0:\ncontent",
157+
"meta_value 1 | document number 1:\ncontent",
158+
"meta_value 2 | document number 2:\ncontent",
159+
"meta_value 3 | document number 3:\ncontent",
160+
"meta_value 4 | document number 4:\ncontent"
160161
]
161162

162163
def test_run_wrong_input_format(self):
163-
embedder = GoogleGenAIDocumentEmbedder(api_key=Secret.from_token("fake-api-key"))
164+
embedder = GoogleGenAIDocumentEmbedder(
165+
api_key=Secret.from_token("fake-api-key"))
164166

165167
# wrong formats
166168
string_input = "text"
@@ -173,7 +175,8 @@ def test_run_wrong_input_format(self):
173175
embedder.run(documents=list_integers_input)
174176

175177
def test_run_on_empty_list(self):
176-
embedder = GoogleGenAIDocumentEmbedder(api_key=Secret.from_token("fake-api-key"))
178+
embedder = GoogleGenAIDocumentEmbedder(
179+
api_key=Secret.from_token("fake-api-key"))
177180

178181
empty_list_input = []
179182
result = embedder.run(documents=empty_list_input)
@@ -189,12 +192,14 @@ def test_run_on_empty_list(self):
189192
def test_run(self):
190193
docs = [
191194
Document(content="I love cheese", meta={"topic": "Cuisine"}),
192-
Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}),
195+
Document(content="A transformer is a deep learning architecture", meta={
196+
"topic": "ML"}),
193197
]
194198

195199
model = "text-embedding-004"
196200

197-
embedder = GoogleGenAIDocumentEmbedder(model=model, meta_fields_to_embed=["topic"], embedding_separator=" | ")
201+
embedder = GoogleGenAIDocumentEmbedder(model=model, meta_fields_to_embed=[
202+
"topic"], embedding_separator=" | ")
198203

199204
result = embedder.run(documents=docs)
200205
documents_with_embeddings = result["documents"]

0 commit comments

Comments (0)