diff --git a/integrations/fastembed/LICENSE.txt b/integrations/fastembed/LICENSE.txt index 137069b823..a0f3c3ec48 100644 --- a/integrations/fastembed/LICENSE.txt +++ b/integrations/fastembed/LICENSE.txt @@ -58,7 +58,7 @@ APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. -Copyright [yyyy] [name of copyright owner] +Copyright 2024 deepset GmbH Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py index d73c297663..50c5297ea8 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/__init__.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: 2024-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 + from .fastembed_document_embedder import FastembedDocumentEmbedder from .fastembed_sparse_document_embedder import FastembedSparseDocumentEmbedder from .fastembed_sparse_text_embedder import FastembedSparseTextEmbedder diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py index bcd1a6111d..37c5184667 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_document_embedder.py @@ -1,8 +1,13 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import replace from typing import Any, Optional from haystack import Document, component, default_to_dict -from .embedding_backend.fastembed_backend import _FastembedEmbeddingBackendFactory +from .embedding_backend.fastembed_backend import _FastembedEmbeddingBackend, _FastembedEmbeddingBackendFactory @component @@ -68,7 +73,7 @@ def __init__( local_files_only: bool = False, meta_fields_to_embed: Optional[list[str]] = None, embedding_separator: str = "\n", - ): + ) -> None: """ Create an FastembedDocumentEmbedder component. @@ -102,6 +107,7 @@ def __init__( self.local_files_only = local_files_only self.meta_fields_to_embed = meta_fields_to_embed or [] self.embedding_separator = embedding_separator + self.embedding_backend: Optional[_FastembedEmbeddingBackend] = None def to_dict(self) -> dict[str, Any]: """ @@ -124,11 +130,11 @@ def to_dict(self) -> dict[str, Any]: embedding_separator=self.embedding_separator, ) - def warm_up(self): + def warm_up(self) -> None: """ Initializes the component. """ - if not hasattr(self, "embedding_backend"): + if self.embedding_backend is None: self.embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( model_name=self.model_name, cache_dir=self.cache_dir, @@ -157,6 +163,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: :param documents: List of Documents to embed. :returns: A dictionary with the following keys: - `documents`: List of Documents with each Document's `embedding` field set to the computed embeddings. + :raises TypeError: If the input is not a list of Documents. """ if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)): msg = ( @@ -164,19 +171,20 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: "In case you want to embed a list of strings, please use the FastembedTextEmbedder." ) raise TypeError(msg) - if not hasattr(self, "embedding_backend"): - msg = "The embedding model has not been loaded. Please call warm_up() before running." - raise RuntimeError(msg) + + if self.embedding_backend is None: + self.warm_up() texts_to_embed = self._prepare_texts_to_embed(documents=documents) - embeddings = self.embedding_backend.embed( + embeddings = self.embedding_backend.embed( # type: ignore[union-attr] texts_to_embed, batch_size=self.batch_size, progress_bar=self.progress_bar, parallel=self.parallel, ) + new_documents = [] for doc, emb in zip(documents, embeddings): - doc.embedding = emb + new_documents.append(replace(doc, embedding=emb)) - return {"documents": documents} + return {"documents": new_documents} diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py index 40137b4ea9..62729e814a 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_document_embedder.py @@ -1,8 +1,16 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import replace from typing import Any, Optional from haystack import Document, component, default_to_dict -from .embedding_backend.fastembed_backend import _FastembedSparseEmbeddingBackendFactory +from .embedding_backend.fastembed_backend import ( + _FastembedSparseEmbeddingBackend, + _FastembedSparseEmbeddingBackendFactory, +) @component @@ -63,7 +71,7 @@ def __init__( meta_fields_to_embed: Optional[list[str]] = None, embedding_separator: str = "\n", model_kwargs: Optional[dict[str, Any]] = None, - ): + ) -> None: """ Create an FastembedDocumentEmbedder component. @@ -95,6 +103,7 @@ def __init__( self.meta_fields_to_embed = meta_fields_to_embed or [] self.embedding_separator = embedding_separator self.model_kwargs = model_kwargs + self.embedding_backend: Optional[_FastembedSparseEmbeddingBackend] = None def to_dict(self) -> dict[str, Any]: """ @@ -116,11 +125,11 @@ def to_dict(self) -> dict[str, Any]: model_kwargs=self.model_kwargs, ) - def warm_up(self): + def warm_up(self) -> None: """ Initializes the component. """ - if not hasattr(self, "embedding_backend"): + if self.embedding_backend is None: self.embedding_backend = _FastembedSparseEmbeddingBackendFactory.get_embedding_backend( model_name=self.model_name, cache_dir=self.cache_dir, @@ -149,6 +158,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: :returns: A dictionary with the following keys: - `documents`: List of Documents with each Document's `sparse_embedding` field set to the computed embeddings. + :raises TypeError: If the input is not a list of Documents. """ if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)): msg = ( @@ -156,18 +166,20 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: "In case you want to embed a list of strings, please use the FastembedTextEmbedder." ) raise TypeError(msg) - if not hasattr(self, "embedding_backend"): - msg = "The embedding model has not been loaded. Please call warm_up() before running." - raise RuntimeError(msg) + + if self.embedding_backend is None: + self.warm_up() texts_to_embed = self._prepare_texts_to_embed(documents=documents) - embeddings = self.embedding_backend.embed( + embeddings = self.embedding_backend.embed( # type: ignore[union-attr] texts_to_embed, batch_size=self.batch_size, progress_bar=self.progress_bar, parallel=self.parallel, ) + new_documents = [] for doc, emb in zip(documents, embeddings): - doc.sparse_embedding = emb - return {"documents": documents} + new_documents.append(replace(doc, sparse_embedding=emb)) + + return {"documents": new_documents} diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py index cac95f697a..6d077435de 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_sparse_text_embedder.py @@ -1,9 +1,16 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Optional from haystack import component, default_to_dict from haystack.dataclasses.sparse_embedding import SparseEmbedding -from .embedding_backend.fastembed_backend import _FastembedSparseEmbeddingBackendFactory +from .embedding_backend.fastembed_backend import ( + _FastembedSparseEmbeddingBackend, + _FastembedSparseEmbeddingBackendFactory, +) @component @@ -36,7 +43,7 @@ def __init__( parallel: Optional[int] = None, local_files_only: bool = False, model_kwargs: Optional[dict[str, Any]] = None, - ): + ) -> None: """ Create a FastembedSparseTextEmbedder component. @@ -61,6 +68,7 @@ def __init__( self.parallel = parallel self.local_files_only = local_files_only self.model_kwargs = model_kwargs + self.embedding_backend: Optional[_FastembedSparseEmbeddingBackend] = None def to_dict(self) -> dict[str, Any]: """ @@ -80,11 +88,11 @@ def to_dict(self) -> dict[str, Any]: model_kwargs=self.model_kwargs, ) - def warm_up(self): + def warm_up(self) -> None: """ Initializes the component. """ - if not hasattr(self, "embedding_backend"): + if self.embedding_backend is None: self.embedding_backend = _FastembedSparseEmbeddingBackendFactory.get_embedding_backend( model_name=self.model_name, cache_dir=self.cache_dir, @@ -102,7 +110,6 @@ def run(self, text: str) -> dict[str, SparseEmbedding]: :returns: A dictionary with the following keys: - `embedding`: A list of floats representing the embedding of the input text. :raises TypeError: If the input is not a string. - :raises RuntimeError: If the embedding model has not been loaded. """ if not isinstance(text, str): msg = ( @@ -110,11 +117,11 @@ def run(self, text: str) -> dict[str, SparseEmbedding]: "In case you want to embed a list of Documents, please use the FastembedDocumentEmbedder." ) raise TypeError(msg) - if not hasattr(self, "embedding_backend"): - msg = "The embedding model has not been loaded. Please call warm_up() before running." - raise RuntimeError(msg) - embedding = self.embedding_backend.embed( + if self.embedding_backend is None: + self.warm_up() + + embedding = self.embedding_backend.embed( # type: ignore[union-attr] [text], progress_bar=self.progress_bar, parallel=self.parallel, diff --git a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py index 0c6bb646f3..07daf85dcb 100644 --- a/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py +++ b/integrations/fastembed/src/haystack_integrations/components/embedders/fastembed/fastembed_text_embedder.py @@ -1,8 +1,12 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Optional from haystack import component, default_to_dict -from .embedding_backend.fastembed_backend import _FastembedEmbeddingBackendFactory +from .embedding_backend.fastembed_backend import _FastembedEmbeddingBackend, _FastembedEmbeddingBackendFactory @component @@ -36,7 +40,7 @@ def __init__( progress_bar: bool = True, parallel: Optional[int] = None, local_files_only: bool = False, - ): + ) -> None: """ Create a FastembedTextEmbedder component. @@ -63,6 +67,7 @@ def __init__( self.progress_bar = progress_bar self.parallel = parallel self.local_files_only = local_files_only + self.embedding_backend: Optional[_FastembedEmbeddingBackend] = None def to_dict(self) -> dict[str, Any]: """ @@ -83,11 +88,11 @@ def to_dict(self) -> dict[str, Any]: local_files_only=self.local_files_only, ) - def warm_up(self): + def warm_up(self) -> None: """ Initializes the component. """ - if not hasattr(self, "embedding_backend"): + if self.embedding_backend is None: self.embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend( model_name=self.model_name, cache_dir=self.cache_dir, @@ -104,7 +109,6 @@ def run(self, text: str) -> dict[str, list[float]]: :returns: A dictionary with the following keys: - `embedding`: A list of floats representing the embedding of the input text. :raises TypeError: If the input is not a string. - :raises RuntimeError: If the embedding model has not been loaded. """ if not isinstance(text, str): msg = ( @@ -112,13 +116,13 @@ def run(self, text: str) -> dict[str, list[float]]: "In case you want to embed a list of Documents, please use the FastembedDocumentEmbedder." ) raise TypeError(msg) - if not hasattr(self, "embedding_backend"): - msg = "The embedding model has not been loaded. Please call warm_up() before running." - raise RuntimeError(msg) + + if self.embedding_backend is None: + self.warm_up() text_to_embed = [self.prefix + text + self.suffix] embedding = list( - self.embedding_backend.embed( + self.embedding_backend.embed( # type: ignore[union-attr] text_to_embed, progress_bar=self.progress_bar, parallel=self.parallel, diff --git a/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/__init__.py b/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/__init__.py index ece5e858b9..318a80ddd3 100644 --- a/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/__init__.py +++ b/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/__init__.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from .ranker import FastembedRanker __all__ = ["FastembedRanker"] diff --git a/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/ranker.py b/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/ranker.py index 9b55fbb4e8..8726c1b637 100644 --- a/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/ranker.py +++ b/integrations/fastembed/src/haystack_integrations/components/rankers/fastembed/ranker.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Optional from haystack import Document, component, default_from_dict, default_to_dict, logging diff --git a/integrations/fastembed/tests/test_fastembed_backend.py b/integrations/fastembed/tests/test_fastembed_backend.py index 994a6f8835..f3567e56a7 100644 --- a/integrations/fastembed/tests/test_fastembed_backend.py +++ b/integrations/fastembed/tests/test_fastembed_backend.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import patch from haystack_integrations.components.embedders.fastembed.embedding_backend.fastembed_backend import ( diff --git a/integrations/fastembed/tests/test_fastembed_document_embedder.py b/integrations/fastembed/tests/test_fastembed_document_embedder.py index ad22ab1e97..4aad6268f5 100644 --- a/integrations/fastembed/tests/test_fastembed_document_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_document_embedder.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import MagicMock, patch import numpy as np diff --git a/integrations/fastembed/tests/test_fastembed_ranker.py b/integrations/fastembed/tests/test_fastembed_ranker.py index a5e72536ca..1b1de7ed67 100644 --- a/integrations/fastembed/tests/test_fastembed_ranker.py +++ b/integrations/fastembed/tests/test_fastembed_ranker.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import MagicMock import pytest @@ -211,20 +215,6 @@ def test_run_incorrect_input_format(self): ): ranker.run(query=query, documents=list_document, top_k=-3) - def test_run_no_warmup(self): - """ - Test for checking error when calling without a warmup. - """ - ranker = FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-12-v2") - - query = "query" - list_document = [Document("Document 1")] - - with pytest.raises( - RuntimeError, - ): - ranker.run(query=query, documents=list_document) - def test_run_empty_document_list(self): """ Test for no error when sending no documents. diff --git a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py index 7f8d5faee8..59ea9cd249 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_document_embedder.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import MagicMock, patch import numpy as np diff --git a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py index 9b73f5f3ab..c9e3f77130 100644 --- a/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_sparse_text_embedder.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import MagicMock, patch import numpy as np diff --git a/integrations/fastembed/tests/test_fastembed_text_embedder.py b/integrations/fastembed/tests/test_fastembed_text_embedder.py index f1b2e21e90..da969dffa8 100644 --- a/integrations/fastembed/tests/test_fastembed_text_embedder.py +++ b/integrations/fastembed/tests/test_fastembed_text_embedder.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import MagicMock, patch import numpy as np