Skip to content

Commit cbc0ae1

Browse files
authored
feat: Update Optimum components to automatically call warm_up in run and to not modify Documents in place (#2675)
* Auto call warm_up * Add missing license headers * Don't modify docs in place
1 parent bbd4a18 commit cbc0ae1

9 files changed

Lines changed: 43 additions & 15 deletions

File tree

integrations/optimum/LICENSE.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ APPENDIX: How to apply the Apache License to your work.
5858

5959
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.
6060

61-
Copyright [yyyy] [name of copyright owner]
61+
Copyright 2024 deepset GmbH
6262

6363
Licensed under the Apache License, Version 2.0 (the "License");
6464
you may not use this file except in compliance with the License.

integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
15
import copy
26
import json
37
from dataclasses import dataclass

integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
15
from dataclasses import dataclass
26
from enum import Enum
37
from typing import Any

integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from dataclasses import replace
16
from typing import Any, Optional, Union
27

38
from haystack import Document, component, default_from_dict, default_to_dict
@@ -52,7 +57,7 @@ def __init__(
5257
progress_bar: bool = True,
5358
meta_fields_to_embed: Optional[list[str]] = None,
5459
embedding_separator: str = "\n",
55-
):
60+
) -> None:
5661
"""
5762
Create a OptimumDocumentEmbedder component.
5863
@@ -136,7 +141,7 @@ def __init__(
136141
self._backend = _EmbedderBackend(params)
137142
self._initialized = False
138143

139-
def warm_up(self):
144+
def warm_up(self) -> None:
140145
"""
141146
Initializes the component.
142147
"""
@@ -200,14 +205,12 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
200205
A list of Documents to embed.
201206
:returns:
202207
The updated Documents with their embeddings.
203-
:raises RuntimeError:
204-
If the component was not initialized.
205208
:raises TypeError:
206209
If the input is not a list of Documents.
207210
"""
208211
if not self._initialized:
209-
msg = "The embedding model has not been loaded. Please call warm_up() before running."
210-
raise RuntimeError(msg)
212+
self.warm_up()
213+
211214
if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
212215
msg = (
213216
"OptimumDocumentEmbedder expects a list of Documents as input."
@@ -221,7 +224,9 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
221224

222225
texts_to_embed = self._prepare_texts_to_embed(documents=documents)
223226
embeddings = self._backend.embed_texts(texts_to_embed)
227+
228+
new_documents = []
224229
for doc, emb in zip(documents, embeddings):
225-
doc.embedding = emb
230+
new_documents.append(replace(doc, embedding=emb))
226231

227-
return {"documents": documents}
232+
return {"documents": new_documents}

integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
15
from typing import Any, Optional, Union
26

37
from haystack import component, default_from_dict, default_to_dict
@@ -162,14 +166,11 @@ def run(self, text: str) -> dict[str, list[float]]:
162166
The text to embed.
163167
:returns:
164168
The embeddings of the text.
165-
:raises RuntimeError:
166-
If the component was not initialized.
167169
:raises TypeError:
168170
If the input is not a string.
169171
"""
170172
if not self._initialized:
171-
msg = "The embedding model has not been loaded. Please call warm_up() before running."
172-
raise RuntimeError(msg)
173+
self.warm_up()
173174

174175
if not isinstance(text, str):
175176
msg = (

integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
15
from enum import Enum
26

37

integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
15
from dataclasses import dataclass
26
from enum import Enum
37
from typing import Any

integrations/optimum/tests/test_optimum_document_embedder.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
15
import copy
26
import tempfile
37
from unittest.mock import MagicMock, patch
@@ -371,7 +375,6 @@ def test_run(self, opt_config, quant_config):
371375
optimizer_settings=opt_config,
372376
quantizer_settings=quant_config,
373377
)
374-
embedder.warm_up()
375378

376379
result = embedder.run(documents=docs)
377380
_ = [embedder.run([d]) for d in docs_copy]

integrations/optimum/tests/test_optimum_text_embedder.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
15
from unittest.mock import MagicMock, patch
26

37
import pytest
@@ -252,7 +256,6 @@ def test_run(self):
252256
suffix=" suffix",
253257
pooling_mode=pooling_mode,
254258
)
255-
embedder.warm_up()
256259

257260
result = embedder.run(text="The food was delicious")
258261

0 commit comments

Comments
 (0)