diff --git a/integrations/optimum/LICENSE.txt b/integrations/optimum/LICENSE.txt index 137069b823..a0f3c3ec48 100644 --- a/integrations/optimum/LICENSE.txt +++ b/integrations/optimum/LICENSE.txt @@ -58,7 +58,7 @@ APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. -Copyright [yyyy] [name of copyright owner] +Copyright 2024 deepset GmbH Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py index 0896210d86..2dadfe75e3 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/_backend.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + import copy import json from dataclasses import dataclass diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py index a065f796b5..956338c962 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimization.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from enum import Enum from typing import Any diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py index c399d6be41..727f0e7f56 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_document_embedder.py @@ -1,3 +1,8 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import replace from typing import Any, Optional, Union from haystack import Document, component, default_from_dict, default_to_dict @@ -52,7 +57,7 @@ def __init__( progress_bar: bool = True, meta_fields_to_embed: Optional[list[str]] = None, embedding_separator: str = "\n", - ): + ) -> None: """ Create a OptimumDocumentEmbedder component. @@ -136,7 +141,7 @@ def __init__( self._backend = _EmbedderBackend(params) self._initialized = False - def warm_up(self): + def warm_up(self) -> None: """ Initializes the component. """ @@ -200,14 +205,12 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: A list of Documents to embed. :returns: The updated Documents with their embeddings. - :raises RuntimeError: - If the component was not initialized. :raises TypeError: If the input is not a list of Documents. """ if not self._initialized: - msg = "The embedding model has not been loaded. Please call warm_up() before running." - raise RuntimeError(msg) + self.warm_up() + if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)): msg = ( "OptimumDocumentEmbedder expects a list of Documents as input." @@ -221,7 +224,9 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: texts_to_embed = self._prepare_texts_to_embed(documents=documents) embeddings = self._backend.embed_texts(texts_to_embed) + + new_documents = [] for doc, emb in zip(documents, embeddings): - doc.embedding = emb + new_documents.append(replace(doc, embedding=emb)) - return {"documents": documents} + return {"documents": new_documents} diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py index 2ae77b20a7..2b325c89ef 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/optimum_text_embedder.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Optional, Union from haystack import component, default_from_dict, default_to_dict @@ -162,14 +166,11 @@ def run(self, text: str) -> dict[str, list[float]]: The text to embed. :returns: The embeddings of the text. - :raises RuntimeError: - If the component was not initialized. :raises TypeError: If the input is not a string. """ if not self._initialized: - msg = "The embedding model has not been loaded. Please call warm_up() before running." - raise RuntimeError(msg) + self.warm_up() if not isinstance(text, str): msg = ( diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py index 2c8bbd9678..5e88a29835 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/pooling.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from enum import Enum diff --git a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py index fd2c484dc3..ebced48d5e 100644 --- a/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py +++ b/integrations/optimum/src/haystack_integrations/components/embedders/optimum/quantization.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from enum import Enum from typing import Any diff --git a/integrations/optimum/tests/test_optimum_document_embedder.py b/integrations/optimum/tests/test_optimum_document_embedder.py index 5ec10ef2cd..4a34092bd7 100644 --- a/integrations/optimum/tests/test_optimum_document_embedder.py +++ b/integrations/optimum/tests/test_optimum_document_embedder.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + import copy import tempfile from unittest.mock import MagicMock, patch @@ -371,7 +375,6 @@ def test_run(self, opt_config, quant_config): optimizer_settings=opt_config, quantizer_settings=quant_config, ) - embedder.warm_up() result = embedder.run(documents=docs) _ = [embedder.run([d]) for d in docs_copy] diff --git a/integrations/optimum/tests/test_optimum_text_embedder.py b/integrations/optimum/tests/test_optimum_text_embedder.py index 24f460281a..f0700e52a7 100644 --- a/integrations/optimum/tests/test_optimum_text_embedder.py +++ b/integrations/optimum/tests/test_optimum_text_embedder.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import MagicMock, patch import pytest @@ -252,7 +256,6 @@ def test_run(self): suffix=" suffix", pooling_mode=pooling_mode, ) - embedder.warm_up() result = embedder.run(text="The food was delicious")