Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions integrations/faiss/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# faiss-haystack

This package provides a [FAISS](https://github.com/facebookresearch/faiss) document store for [Haystack](https://github.com/deepset-ai/haystack).

## Installation

```bash
pip install faiss-haystack
```

## Usage

```python
from haystack_integrations.document_stores.faiss import FAISSDocumentStore

document_store = FAISSDocumentStore(index_path="my_index")
```
14 changes: 14 additions & 0 deletions integrations/faiss/pydoc/config_docusaurus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# pydoc-markdown configuration for this integration's API reference.
loaders:
  - modules:
      # Modules whose docstrings are rendered into the reference page.
      - haystack_integrations.components.retrievers.faiss.embedding_retriever
      - haystack_integrations.document_stores.faiss.document_store
    # Where the modules above are looked up.
    # NOTE(review): presumably resolved relative to this pydoc/ directory — confirm.
    search_path: [../src]
processors:
  # Drop undocumented members and modules that end up empty.
  - type: filter
    documented_only: true
    skip_empty_modules: true
renderer:
  # Metadata and output file name of the generated page.
  description: FAISS integration for Haystack
  id: integrations-faiss
  filename: faiss.md
  title: FAISS
159 changes: 159 additions & 0 deletions integrations/faiss/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Build backend is hatchling; hatch-vcs is required because the version is
# taken from version control (see [tool.hatch.version], source = "vcs").
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

# Core package metadata (PEP 621).
[project]
name = "faiss-haystack"
# The version is not written here; it is supplied at build time (see [tool.hatch.version]).
dynamic = ["version"]
# One-line summary shown by package indexes.
description = "FAISS document store and embedding retriever integration for Haystack"
readme = "README.md"
requires-python = ">=3.10"
license = "Apache-2.0"
keywords = []
authors = [{ name = "Deepset", email = "info@deepset.ai" }]
classifiers = [
  "License :: OSI Approved :: Apache Software License",
  "Development Status :: 4 - Beta",
  "Programming Language :: Python",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
  "Programming Language :: Python :: Implementation :: CPython",
  "Programming Language :: Python :: Implementation :: PyPy",
]
# Runtime requirements (PEP 508 strings).
dependencies = [
  "haystack-ai>=2.24.0",
  "faiss-cpu>=1.8.0",
  "numpy",
]

# Project links published with the package metadata; all of them point at the
# haystack-core-integrations repository (docs and source under integrations/faiss).
[project.urls]
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/faiss#readme"
Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/faiss"

# Include everything under src/haystack_integrations in the wheel.
[tool.hatch.build.targets.wheel]
packages = ["src/haystack_integrations"]

# The version is derived from git tags shaped like integrations/faiss-v<version>;
# the named group in tag-pattern captures the version part of the tag.
[tool.hatch.version]
source = "vcs"
tag-pattern = 'integrations\/faiss-v(?P<version>.*)'

# Extra options handed to the VCS version backend (presumably setuptools-scm — confirm).
# root = "../.." points two directories above this file; this project lives in
# integrations/faiss, so that is the top of the repository.
# git_describe_command only matches tags that start with integrations/faiss-v followed by a digit.
[tool.hatch.version.raw-options]
root = "../.."
git_describe_command = 'git describe --tags --match="integrations/faiss-v[0-9]*"'

# Default Hatch environment: installs with uv and provides the tools used by the scripts below.
[tool.hatch.envs.default]
installer = "uv"
dependencies = ["haystack-pydoc-tools", "ruff"]

# `{args}` is replaced by any extra command-line arguments passed to the script.
[tool.hatch.envs.default.scripts]
# Render the API reference from pydoc/config_docusaurus.yml.
docs = ["pydoc-markdown pydoc/config_docusaurus.yml"]
# Apply lint fixes, then reformat. With ";" the format step runs even when the lint step
# reports errors (assuming the command is executed through a POSIX shell — confirm).
fmt = "ruff check --fix {args}; ruff format {args}"
# Read-only variant: "&&" stops at the first failing step.
fmt-check = "ruff check {args} && ruff format --check {args}"

# Test environment: pytest with its coverage and rerun plugins, plus mypy for the `types` script.
[tool.hatch.envs.test]
dependencies = [
"pytest",
"pytest-cov",
"pytest-rerunfailures",
"mypy",
"pandas",
]

# `{args:tests}` forwards extra command-line arguments and falls back to the `tests`
# directory when none are given.
[tool.hatch.envs.test.scripts]
# unit / integration select tests by the `integration` marker declared under [tool.pytest.ini_options].
unit = 'pytest -m "not integration" {args:tests}'
integration = 'pytest -m "integration" {args:tests}'
all = 'pytest {args:tests}'
# Coverage run: failing tests are retried up to 3 times, 30 seconds apart; -x aborts the run
# at the first test that still fails.
cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}'

# Type-check the two packages of this integration (document store and retriever).
types = "mypy -p haystack_integrations.document_stores.faiss -p haystack_integrations.components.retrievers.faiss {args}"

# mypy settings used by the `types` script.
[tool.mypy]
# Let mypy install missing stub packages on its own, without prompting.
install_types = true
non_interactive = true
# Also check the bodies of unannotated functions, and reject partially annotated signatures.
check_untyped_defs = true
disallow_incomplete_defs = true

# Permits direct references (for example `pkg @ git+https://...`) in dependency lists.
# NOTE(review): none of the dependency lists in this file uses a direct reference —
# confirm whether this switch is still needed.
[tool.hatch.metadata]
allow-direct-references = true

# Ruff configuration shared by the `fmt` and `fmt-check` scripts.
[tool.ruff]
line-length = 120

[tool.ruff.lint]
# Enabled rule families, by prefix: flake8 plugins (A, ARG, B, C, DTZ, EM, FBT, ICN, ISC, Q, S, T, TID, YTT),
# pycodestyle (E, W), pyflakes (F), isort (I), pep8-naming (N), pylint (PLC, PLE, PLR, PLW),
# pyupgrade (UP) and Ruff's own rules (RUF).
select = [
"A",
"ARG",
"B",
"C",
"DTZ",
"E",
"EM",
"F",
"FBT",
"I",
"ICN",
"ISC",
"N",
"PLC",
"PLE",
"PLR",
"PLW",
"Q",
"RUF",
"S",
"T",
"TID",
"UP",
"W",
"YTT",
]
# Individual rules switched off for the whole project; the reason precedes each group.
ignore = [
# Allow non-abstract empty methods in abstract base classes
"B027",
# Allow boolean positional values in function calls, like `dict.get(... True)`
"FBT003",
# Ignore checks for possible passwords
"S105",
"S106",
"S107",
# Ignore complexity
"C901",
"PLR0911",
"PLR0912",
"PLR0913",
"PLR0915",
# Ignore unused params
"ARG002",
# Allow assertions
"S101",
]
# NOTE(review): "example" is excluded from linting here, yet it also has a per-file-ignores
# entry below — confirm which of the two is intended.
exclude = ["example"]

# Treat haystack_integrations as first-party when sorting imports.
[tool.ruff.lint.isort]
known-first-party = ["haystack_integrations"]

# Relative imports that reach into a parent package are banned; sibling imports stay allowed.
[tool.ruff.lint.flake8-tidy-imports]
ban-relative-imports = "parents"

[tool.ruff.lint.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"tests/**/*" = ["PLR2004", "S101", "TID252"]
# Example code may call print().
"example/**/*" = ["T201"]

# coverage.py: measure the haystack_integrations package, including branch coverage.
[tool.coverage.run]
source = ["haystack_integrations"]
branch = true
parallel = false


# Report settings: test files and __init__.py files are left out, and missed lines are listed.
[tool.coverage.report]
omit = ["*/tests/*", "*/__init__.py"]
show_missing = true
# Regular expressions for lines to leave out of the report; the dots around __main__ match
# either quote character.
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]


# pytest settings for the test scripts.
[tool.pytest.ini_options]
minversion = "6.0"
# Registers the marker that the `unit` and `integration` Hatch scripts filter on.
markers = ["integration: integration tests"]
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from .embedding_retriever import FAISSEmbeddingRetriever

__all__ = ["FAISSEmbeddingRetriever"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any

from haystack import component, default_from_dict, default_to_dict
from haystack.dataclasses import Document
from haystack.document_stores.types import FilterPolicy
from haystack.document_stores.types.filter_policy import apply_filter_policy

from haystack_integrations.document_stores.faiss import FAISSDocumentStore


@component
class FAISSEmbeddingRetriever:
    """
    Embedding-based retriever that reads from a `FAISSDocumentStore`.

    Given a query embedding, the retriever asks the document store for the documents that are most similar
    to it, optionally narrowed down by filters.

    Example usage:
    ```python
    from haystack import Document, Pipeline
    from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
    from haystack.document_stores.types import DuplicatePolicy

    from haystack_integrations.document_stores.faiss import FAISSDocumentStore
    from haystack_integrations.components.retrievers.faiss import FAISSEmbeddingRetriever

    document_store = FAISSDocumentStore(embedding_dim=768)

    documents = [
        Document(content="There are over 7,000 languages spoken around the world today."),
        Document(content="Elephants have been observed to behave in a way that indicates a high level of intelligence."),
        Document(content="In certain places, you can witness the phenomenon of bioluminescent waves."),
    ]

    document_embedder = SentenceTransformersDocumentEmbedder()
    document_embedder.warm_up()
    documents_with_embeddings = document_embedder.run(documents)["documents"]

    document_store.write_documents(documents_with_embeddings, policy=DuplicatePolicy.OVERWRITE)

    query_pipeline = Pipeline()
    query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
    query_pipeline.add_component("retriever", FAISSEmbeddingRetriever(document_store=document_store))
    query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

    query = "How many languages are there?"
    res = query_pipeline.run({"text_embedder": {"text": query}})

    assert res["retriever"]["documents"][0].content == "There are over 7,000 languages spoken around the world today."
    ```
    """  # noqa: E501

    def __init__(
        self,
        *,
        document_store: FAISSDocumentStore,
        filters: dict[str, Any] | None = None,
        top_k: int = 10,
        filter_policy: str | FilterPolicy = FilterPolicy.REPLACE,
    ):
        """
        Create the retriever.

        :param document_store: The `FAISSDocumentStore` to search in.
        :param filters: Init-time filters. They are combined with the filters passed to `run()` as dictated by
            `filter_policy`. `None` (or any empty value) is stored as an empty dict.
        :param top_k: Default upper bound on the number of Documents returned.
        :param filter_policy: How init-time and runtime filters are combined, given either as a `FilterPolicy`
            member or as its string form. Defaults to `FilterPolicy.REPLACE`.
        :raises ValueError: If `document_store` is not an instance of `FAISSDocumentStore`.
        """
        if not isinstance(document_store, FAISSDocumentStore):
            msg = "document_store must be an instance of FAISSDocumentStore"
            raise ValueError(msg)

        self.document_store = document_store
        self.filters = filters if filters else {}
        self.top_k = top_k

        # Accept the policy in string form as well and normalise it to the enum.
        if isinstance(filter_policy, FilterPolicy):
            self.filter_policy = filter_policy
        else:
            self.filter_policy = FilterPolicy.from_str(filter_policy)

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        The filter policy is stored by its value and the document store through its own `to_dict()`.

        :returns: Dictionary with serialized data.
        """
        init_parameters: dict[str, Any] = {
            "filters": self.filters,
            "top_k": self.top_k,
            "filter_policy": self.filter_policy.value,
            "document_store": self.document_store.to_dict(),
        }
        return default_to_dict(self, **init_parameters)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "FAISSEmbeddingRetriever":
        """
        Deserializes the component from a dictionary.

        The serialized document store under `init_parameters` is replaced, in `data` itself, by a
        `FAISSDocumentStore` instance before the component is built.

        :param data: Dictionary to deserialize from.
        :returns: Deserialized component.
        """
        init_parameters = data["init_parameters"]
        init_parameters["document_store"] = FAISSDocumentStore.from_dict(init_parameters["document_store"])
        return default_from_dict(cls, data)

    @component.output_types(documents=list[Document])
    def run(
        self,
        query_embedding: list[float],
        filters: dict[str, Any] | None = None,
        top_k: int | None = None,
    ) -> dict[str, list[Document]]:
        """
        Retrieve documents from the `FAISSDocumentStore`, based on their embeddings.

        :param query_embedding: Embedding of the query.
        :param filters: Runtime filters. How they interact with the init-time filters is decided by the
            `filter_policy` chosen when the retriever was created.
        :param top_k: Maximum number of Documents to return. A falsy value (`None` or `0`) falls back to the
            value set at initialization.
        :returns: A dictionary with the following keys:
            - `documents`: List of `Document`s that are similar to `query_embedding`.
        """
        effective_filters = apply_filter_policy(self.filter_policy, self.filters, filters)
        limit = top_k if top_k else self.top_k
        retrieved = self.document_store.search(
            query_embedding=query_embedding,
            top_k=limit,
            filters=effective_filters,
        )
        return {"documents": retrieved}

    @component.output_types(documents=list[Document])
    async def run_async(
        self,
        query_embedding: list[float],
        filters: dict[str, Any] | None = None,
        top_k: int | None = None,
    ) -> dict[str, list[Document]]:
        """
        Asynchronously retrieve documents from the `FAISSDocumentStore`, based on their embeddings.

        FAISS search is CPU-bound and fully in-memory, with no I/O or network calls involved, so this
        coroutine simply calls the synchronous `run()` method and returns its result.

        :param query_embedding: Embedding of the query.
        :param filters: Runtime filters. How they interact with the init-time filters is decided by the
            `filter_policy` chosen when the retriever was created.
        :param top_k: Maximum number of Documents to return. A falsy value (`None` or `0`) falls back to the
            value set at initialization.
        :returns: A dictionary with the following keys:
            - `documents`: List of `Document`s that are similar to `query_embedding`.
        """
        result = self.run(query_embedding=query_embedding, filters=filters, top_k=top_k)
        return result
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .document_store import FAISSDocumentStore

__all__ = ["FAISSDocumentStore"]
Loading