Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion integrations/fastembed/LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.

Copyright [yyyy] [name of copyright owner]
Copyright 2024 deepset GmbH

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from .fastembed_document_embedder import FastembedDocumentEmbedder
from .fastembed_sparse_document_embedder import FastembedSparseDocumentEmbedder
from .fastembed_sparse_text_embedder import FastembedSparseTextEmbedder
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from dataclasses import replace
from typing import Any, Optional

from haystack import Document, component, default_to_dict

from .embedding_backend.fastembed_backend import _FastembedEmbeddingBackendFactory
from .embedding_backend.fastembed_backend import _FastembedEmbeddingBackend, _FastembedEmbeddingBackendFactory


@component
Expand Down Expand Up @@ -68,7 +73,7 @@ def __init__(
local_files_only: bool = False,
meta_fields_to_embed: Optional[list[str]] = None,
embedding_separator: str = "\n",
):
) -> None:
"""
Create a FastembedDocumentEmbedder component.

Expand Down Expand Up @@ -102,6 +107,7 @@ def __init__(
self.local_files_only = local_files_only
self.meta_fields_to_embed = meta_fields_to_embed or []
self.embedding_separator = embedding_separator
self.embedding_backend: Optional[_FastembedEmbeddingBackend] = None

def to_dict(self) -> dict[str, Any]:
"""
Expand All @@ -124,11 +130,11 @@ def to_dict(self) -> dict[str, Any]:
embedding_separator=self.embedding_separator,
)

def warm_up(self):
def warm_up(self) -> None:
"""
Initializes the component.
"""
if not hasattr(self, "embedding_backend"):
if self.embedding_backend is None:
self.embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend(
model_name=self.model_name,
cache_dir=self.cache_dir,
Expand Down Expand Up @@ -157,26 +163,28 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
:param documents: List of Documents to embed.
:returns: A dictionary with the following keys:
- `documents`: List of Documents with each Document's `embedding` field set to the computed embeddings.
:raises TypeError: If the input is not a list of Documents.
"""
if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
msg = (
"FastembedDocumentEmbedder expects a list of Documents as input. "
"In case you want to embed a list of strings, please use the FastembedTextEmbedder."
)
raise TypeError(msg)
if not hasattr(self, "embedding_backend"):
msg = "The embedding model has not been loaded. Please call warm_up() before running."
raise RuntimeError(msg)

if self.embedding_backend is None:
self.warm_up()

texts_to_embed = self._prepare_texts_to_embed(documents=documents)
embeddings = self.embedding_backend.embed(
embeddings = self.embedding_backend.embed( # type: ignore[union-attr]
texts_to_embed,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
parallel=self.parallel,
)

new_documents = []
for doc, emb in zip(documents, embeddings):
doc.embedding = emb
new_documents.append(replace(doc, embedding=emb))

return {"documents": documents}
return {"documents": new_documents}
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from dataclasses import replace
from typing import Any, Optional

from haystack import Document, component, default_to_dict

from .embedding_backend.fastembed_backend import _FastembedSparseEmbeddingBackendFactory
from .embedding_backend.fastembed_backend import (
_FastembedSparseEmbeddingBackend,
_FastembedSparseEmbeddingBackendFactory,
)


@component
Expand Down Expand Up @@ -63,7 +71,7 @@ def __init__(
meta_fields_to_embed: Optional[list[str]] = None,
embedding_separator: str = "\n",
model_kwargs: Optional[dict[str, Any]] = None,
):
) -> None:
"""
Create a FastembedSparseDocumentEmbedder component.

Expand Down Expand Up @@ -95,6 +103,7 @@ def __init__(
self.meta_fields_to_embed = meta_fields_to_embed or []
self.embedding_separator = embedding_separator
self.model_kwargs = model_kwargs
self.embedding_backend: Optional[_FastembedSparseEmbeddingBackend] = None

def to_dict(self) -> dict[str, Any]:
"""
Expand All @@ -116,11 +125,11 @@ def to_dict(self) -> dict[str, Any]:
model_kwargs=self.model_kwargs,
)

def warm_up(self):
def warm_up(self) -> None:
"""
Initializes the component.
"""
if not hasattr(self, "embedding_backend"):
if self.embedding_backend is None:
self.embedding_backend = _FastembedSparseEmbeddingBackendFactory.get_embedding_backend(
model_name=self.model_name,
cache_dir=self.cache_dir,
Expand Down Expand Up @@ -149,25 +158,28 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
:returns: A dictionary with the following keys:
- `documents`: List of Documents with each Document's `sparse_embedding`
field set to the computed embeddings.
:raises TypeError: If the input is not a list of Documents.
"""
if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
msg = (
"FastembedSparseDocumentEmbedder expects a list of Documents as input. "
"In case you want to embed a list of strings, please use the FastembedTextEmbedder."
)
raise TypeError(msg)
if not hasattr(self, "embedding_backend"):
msg = "The embedding model has not been loaded. Please call warm_up() before running."
raise RuntimeError(msg)

if self.embedding_backend is None:
self.warm_up()

texts_to_embed = self._prepare_texts_to_embed(documents=documents)
embeddings = self.embedding_backend.embed(
embeddings = self.embedding_backend.embed( # type: ignore[union-attr]
texts_to_embed,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
parallel=self.parallel,
)

new_documents = []
for doc, emb in zip(documents, embeddings):
doc.sparse_embedding = emb
return {"documents": documents}
new_documents.append(replace(doc, sparse_embedding=emb))

return {"documents": new_documents}
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Optional

from haystack import component, default_to_dict
from haystack.dataclasses.sparse_embedding import SparseEmbedding

from .embedding_backend.fastembed_backend import _FastembedSparseEmbeddingBackendFactory
from .embedding_backend.fastembed_backend import (
_FastembedSparseEmbeddingBackend,
_FastembedSparseEmbeddingBackendFactory,
)


@component
Expand Down Expand Up @@ -36,7 +43,7 @@ def __init__(
parallel: Optional[int] = None,
local_files_only: bool = False,
model_kwargs: Optional[dict[str, Any]] = None,
):
) -> None:
"""
Create a FastembedSparseTextEmbedder component.

Expand All @@ -61,6 +68,7 @@ def __init__(
self.parallel = parallel
self.local_files_only = local_files_only
self.model_kwargs = model_kwargs
self.embedding_backend: Optional[_FastembedSparseEmbeddingBackend] = None

def to_dict(self) -> dict[str, Any]:
"""
Expand All @@ -80,11 +88,11 @@ def to_dict(self) -> dict[str, Any]:
model_kwargs=self.model_kwargs,
)

def warm_up(self):
def warm_up(self) -> None:
"""
Initializes the component.
"""
if not hasattr(self, "embedding_backend"):
if self.embedding_backend is None:
self.embedding_backend = _FastembedSparseEmbeddingBackendFactory.get_embedding_backend(
model_name=self.model_name,
cache_dir=self.cache_dir,
Expand All @@ -102,19 +110,18 @@ def run(self, text: str) -> dict[str, SparseEmbedding]:
:returns: A dictionary with the following keys:
- `embedding`: A SparseEmbedding object representing the sparse embedding of the input text.
:raises TypeError: If the input is not a string.
:raises RuntimeError: If the embedding model has not been loaded.
"""
if not isinstance(text, str):
msg = (
"FastembedSparseTextEmbedder expects a string as input. "
"In case you want to embed a list of Documents, please use the FastembedDocumentEmbedder."
)
raise TypeError(msg)
if not hasattr(self, "embedding_backend"):
msg = "The embedding model has not been loaded. Please call warm_up() before running."
raise RuntimeError(msg)

embedding = self.embedding_backend.embed(
if self.embedding_backend is None:
self.warm_up()

embedding = self.embedding_backend.embed( # type: ignore[union-attr]
[text],
progress_bar=self.progress_bar,
parallel=self.parallel,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Optional

from haystack import component, default_to_dict

from .embedding_backend.fastembed_backend import _FastembedEmbeddingBackendFactory
from .embedding_backend.fastembed_backend import _FastembedEmbeddingBackend, _FastembedEmbeddingBackendFactory


@component
Expand Down Expand Up @@ -36,7 +40,7 @@ def __init__(
progress_bar: bool = True,
parallel: Optional[int] = None,
local_files_only: bool = False,
):
) -> None:
"""
Create a FastembedTextEmbedder component.

Expand All @@ -63,6 +67,7 @@ def __init__(
self.progress_bar = progress_bar
self.parallel = parallel
self.local_files_only = local_files_only
self.embedding_backend: Optional[_FastembedEmbeddingBackend] = None

def to_dict(self) -> dict[str, Any]:
"""
Expand All @@ -83,11 +88,11 @@ def to_dict(self) -> dict[str, Any]:
local_files_only=self.local_files_only,
)

def warm_up(self):
def warm_up(self) -> None:
"""
Initializes the component.
"""
if not hasattr(self, "embedding_backend"):
if self.embedding_backend is None:
self.embedding_backend = _FastembedEmbeddingBackendFactory.get_embedding_backend(
model_name=self.model_name,
cache_dir=self.cache_dir,
Expand All @@ -104,21 +109,20 @@ def run(self, text: str) -> dict[str, list[float]]:
:returns: A dictionary with the following keys:
- `embedding`: A list of floats representing the embedding of the input text.
:raises TypeError: If the input is not a string.
:raises RuntimeError: If the embedding model has not been loaded.
"""
if not isinstance(text, str):
msg = (
"FastembedTextEmbedder expects a string as input. "
"In case you want to embed a list of Documents, please use the FastembedDocumentEmbedder."
)
raise TypeError(msg)
if not hasattr(self, "embedding_backend"):
msg = "The embedding model has not been loaded. Please call warm_up() before running."
raise RuntimeError(msg)

if self.embedding_backend is None:
self.warm_up()

text_to_embed = [self.prefix + text + self.suffix]
embedding = list(
self.embedding_backend.embed(
self.embedding_backend.embed( # type: ignore[union-attr]
text_to_embed,
progress_bar=self.progress_bar,
parallel=self.parallel,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from .ranker import FastembedRanker

__all__ = ["FastembedRanker"]
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Optional

from haystack import Document, component, default_from_dict, default_to_dict, logging
Expand Down
4 changes: 4 additions & 0 deletions integrations/fastembed/tests/test_fastembed_backend.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from unittest.mock import patch

from haystack_integrations.components.embedders.fastembed.embedding_backend.fastembed_backend import (
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from unittest.mock import MagicMock, patch

import numpy as np
Expand Down
18 changes: 4 additions & 14 deletions integrations/fastembed/tests/test_fastembed_ranker.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from unittest.mock import MagicMock

import pytest
Expand Down Expand Up @@ -211,20 +215,6 @@ def test_run_incorrect_input_format(self):
):
ranker.run(query=query, documents=list_document, top_k=-3)

def test_run_no_warmup(self):
"""
Test for checking error when calling without a warmup.
"""
ranker = FastembedRanker(model_name="Xenova/ms-marco-MiniLM-L-12-v2")

query = "query"
list_document = [Document("Document 1")]

with pytest.raises(
RuntimeError,
):
ranker.run(query=query, documents=list_document)

def test_run_empty_document_list(self):
"""
Test for no error when sending no documents.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# SPDX-FileCopyrightText: 2024-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from unittest.mock import MagicMock, patch

import numpy as np
Expand Down
Loading