feat: add FAISSEmbeddingRetriever component

GunaPalanivel · GunaPalanivel · commit f186c9c8d58f · 2026-02-26T16:34:49.000+05:30
- Add components/retrievers/faiss/embedding_retriever.py with the @component decorator, run(), run_async(), to_dict() and from_dict(), including FilterPolicy support and a backward-compatible deserialization guard
- Add package `__init__.py` files for components/, components/retrievers/ and components/retrievers/faiss/
- Add tests/test_embedding_retriever.py with 8 tests covering: basic run, runtime filters, top_k override, to_dict/from_dict roundtrip, FilterPolicy REPLACE/MERGE, ValueError on wrong store type, and end-to-end pipeline execution
- Update the pyproject.toml `types` script to also type-check haystack_integrations.components.retrievers.faiss
diff --git a/integrations/faiss/pyproject.toml b/integrations/faiss/pyproject.toml
@@ -67,7 +67,7 @@ integration = 'pytest -m "integration" {args:tests}'
 all = 'pytest {args:tests}'
 cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}'
 
-types = "mypy -p haystack_integrations.document_stores.faiss {args}"
+types = "mypy -p haystack_integrations.document_stores.faiss -p haystack_integrations.components.retrievers.faiss {args}"
 
 [tool.mypy]
 install_types = true
diff --git a/integrations/faiss/src/haystack_integrations/components/__init__.py b/integrations/faiss/src/haystack_integrations/components/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/integrations/faiss/src/haystack_integrations/components/retrievers/__init__.py b/integrations/faiss/src/haystack_integrations/components/retrievers/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/integrations/faiss/src/haystack_integrations/components/retrievers/faiss/__init__.py b/integrations/faiss/src/haystack_integrations/components/retrievers/faiss/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+from .embedding_retriever import FAISSEmbeddingRetriever
+
+__all__ = ["FAISSEmbeddingRetriever"]
diff --git a/integrations/faiss/src/haystack_integrations/components/retrievers/faiss/embedding_retriever.py b/integrations/faiss/src/haystack_integrations/components/retrievers/faiss/embedding_retriever.py
@@ -0,0 +1,156 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+from haystack import component, default_from_dict, default_to_dict
+from haystack.dataclasses import Document
+from haystack.document_stores.types import FilterPolicy
+from haystack.document_stores.types.filter_policy import apply_filter_policy
+
+from haystack_integrations.document_stores.faiss import FAISSDocumentStore
+
+
+@component
+class FAISSEmbeddingRetriever:
+    """
+    Retrieves documents from the `FAISSDocumentStore`, based on their dense embeddings.
+
+    Example usage:
+    ```python
+    from haystack import Document, Pipeline
+    from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
+    from haystack.document_stores.types import DuplicatePolicy
+
+    from haystack_integrations.document_stores.faiss import FAISSDocumentStore
+    from haystack_integrations.components.retrievers.faiss import FAISSEmbeddingRetriever
+
+    document_store = FAISSDocumentStore(embedding_dim=768)
+
+    documents = [
+        Document(content="There are over 7,000 languages spoken around the world today."),
+        Document(content="Elephants have been observed to behave in a way that indicates a high level of intelligence."),
+        Document(content="In certain places, you can witness the phenomenon of bioluminescent waves."),
+    ]
+
+    document_embedder = SentenceTransformersDocumentEmbedder()
+    document_embedder.warm_up()
+    documents_with_embeddings = document_embedder.run(documents)["documents"]
+
+    document_store.write_documents(documents_with_embeddings, policy=DuplicatePolicy.OVERWRITE)
+
+    query_pipeline = Pipeline()
+    query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
+    query_pipeline.add_component("retriever", FAISSEmbeddingRetriever(document_store=document_store))
+    query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
+
+    query = "How many languages are there?"
+    res = query_pipeline.run({"text_embedder": {"text": query}})
+
+    assert res["retriever"]["documents"][0].content == "There are over 7,000 languages spoken around the world today."
+    ```
+    """
+
+    def __init__(
+        self,
+        *,
+        document_store: FAISSDocumentStore,
+        filters: dict[str, Any] | None = None,
+        top_k: int = 10,
+        filter_policy: str | FilterPolicy = FilterPolicy.REPLACE,
+    ):
+        """
+        Create the retriever.
+
+        :param document_store: An instance of `FAISSDocumentStore`.
+        :param filters: Filters applied to the retrieved Documents at initialisation time. At runtime, these are
+            combined with any runtime filters according to the `filter_policy`. `None` is stored as an empty dict.
+        :param top_k: Maximum number of Documents to return.
+        :param filter_policy: Policy to determine how init-time and runtime filters are combined, given either as a
+            `FilterPolicy` member or as its string form. See `FilterPolicy` for details.
+            Defaults to `FilterPolicy.REPLACE`.
+        :raises ValueError: If `document_store` is not an instance of `FAISSDocumentStore`.
+        """
+        if not isinstance(document_store, FAISSDocumentStore):
+            msg = "document_store must be an instance of FAISSDocumentStore"
+            raise ValueError(msg)
+
+        self.document_store = document_store
+        self.filters = filters or {}
+        self.top_k = top_k
+        # Always keep the policy as an enum member so `to_dict()` can rely on `.value`.
+        self.filter_policy = (
+            filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy)
+        )
+
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        The document store is serialized through its own `to_dict()` and the filter policy is stored as its
+        enum value.
+
+        :returns: Dictionary with serialized data.
+        """
+        return default_to_dict(
+            self,
+            filters=self.filters,
+            top_k=self.top_k,
+            filter_policy=self.filter_policy.value,
+            document_store=self.document_store.to_dict(),
+        )
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "FAISSEmbeddingRetriever":
+        """
+        Deserializes the component from a dictionary.
+
+        The `data` dictionary and its `init_parameters` mapping are not modified: this method works on shallow
+        copies, so the same serialized dictionary can be reused afterwards.
+
+        :param data: Dictionary to deserialize from. Must contain `init_parameters` with a `document_store` entry.
+        :returns: Deserialized component.
+        :raises KeyError: If `init_parameters` or its `document_store` entry is missing.
+        """
+        # Copy before rewriting entries so the caller's serialized data keeps holding plain values
+        # instead of a live document store / enum member.
+        init_params = dict(data["init_parameters"])
+        init_params["document_store"] = FAISSDocumentStore.from_dict(init_params["document_store"])
+        # Pipelines serialized with old versions of the component might not
+        # have the filter_policy field.
+        if filter_policy := init_params.get("filter_policy"):
+            init_params["filter_policy"] = FilterPolicy.from_str(filter_policy)
+        return default_from_dict(cls, {**data, "init_parameters": init_params})
+
+    @component.output_types(documents=list[Document])
+    def run(
+        self,
+        query_embedding: list[float],
+        filters: dict[str, Any] | None = None,
+        top_k: int | None = None,
+    ) -> dict[str, list[Document]]:
+        """
+        Retrieve documents from the `FAISSDocumentStore`, based on their embeddings.
+
+        :param query_embedding: Embedding of the query.
+        :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
+                        the `filter_policy` chosen at retriever initialization. See init method docstring for more
+                        details.
+        :param top_k: Maximum number of Documents to return. Overrides the value set at initialization; `None`
+                      (or `0`) falls back to the initialization value.
+        :returns: A dictionary with the following keys:
+            - `documents`: List of `Document`s that are similar to `query_embedding`.
+        """
+        filters = apply_filter_policy(self.filter_policy, self.filters, filters)
+        # Any falsy `top_k` (None or 0) falls back to the init-time value.
+        top_k = top_k or self.top_k
+        docs = self.document_store.search(query_embedding=query_embedding, top_k=top_k, filters=filters)
+        return {"documents": docs}
+
+    @component.output_types(documents=list[Document])
+    async def run_async(
+        self,
+        query_embedding: list[float],
+        filters: dict[str, Any] | None = None,
+        top_k: int | None = None,
+    ) -> dict[str, list[Document]]:
+        """
+        Asynchronously retrieve documents from the `FAISSDocumentStore`, based on their embeddings.
+
+        This coroutine delegates directly to the synchronous `run()` method and contains no await point, so the
+        event loop does not run other tasks while the search is in progress.
+
+        :param query_embedding: Embedding of the query.
+        :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on
+                        the `filter_policy` chosen at retriever initialization. See init method docstring for more
+                        details.
+        :param top_k: Maximum number of Documents to return. Overrides the value set at initialization; `None`
+                      (or `0`) falls back to the initialization value.
+        :returns: A dictionary with the following keys:
+            - `documents`: List of `Document`s that are similar to `query_embedding`.
+        """
+        return self.run(query_embedding=query_embedding, filters=filters, top_k=top_k)
diff --git a/integrations/faiss/tests/test_embedding_retriever.py b/integrations/faiss/tests/test_embedding_retriever.py
@@ -0,0 +1,132 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+from haystack import Pipeline
+from haystack.dataclasses import Document
+from haystack.document_stores.types import FilterPolicy
+
+from haystack_integrations.components.retrievers.faiss import FAISSEmbeddingRetriever
+from haystack_integrations.document_stores.faiss import FAISSDocumentStore
+
+EMBEDDING_DIM = 3
+
+
+@pytest.fixture
+def document_store():
+    """Provide a fresh FAISSDocumentStore sized for EMBEDDING_DIM-dimensional embeddings."""
+    store = FAISSDocumentStore(embedding_dim=EMBEDDING_DIM)
+    return store
+
+
+@pytest.fixture
+def populated_store(document_store):
+    """Return the store after writing three embedded documents tagged with a `category` meta field."""
+    # (content, embedding, category) for each document, in write order.
+    samples = (
+        ("alpha", [1.0, 0.0, 0.0], "A"),
+        ("beta", [0.0, 1.0, 0.0], "B"),
+        ("gamma", [0.0, 0.0, 1.0], "A"),
+    )
+    document_store.write_documents(
+        [Document(content=text, embedding=vector, meta={"category": label}) for text, vector, label in samples]
+    )
+    return document_store
+
+
+class TestFAISSEmbeddingRetriever:
+    """Unit tests for `FAISSEmbeddingRetriever` against a small three-document store."""
+
+    def test_run_with_query_embedding_only(self, populated_store):
+        """Without filters, `run()` returns `top_k` Document instances."""
+        retriever = FAISSEmbeddingRetriever(document_store=populated_store, top_k=2)
+        result = retriever.run(query_embedding=[1.0, 0.0, 0.0])
+
+        assert "documents" in result
+        assert isinstance(result["documents"], list)
+        assert len(result["documents"]) == 2
+        # All returned items must be Document instances
+        assert all(isinstance(d, Document) for d in result["documents"])
+
+    def test_run_with_filters(self, populated_store):
+        """Runtime filters restrict the results to matching documents."""
+        retriever = FAISSEmbeddingRetriever(document_store=populated_store, top_k=3)
+        filters = {"field": "meta.category", "operator": "==", "value": "A"}
+        result = retriever.run(query_embedding=[1.0, 0.0, 0.0], filters=filters)
+
+        assert "documents" in result
+        contents = [d.content for d in result["documents"]]
+        # Guard against a vacuous pass: the exact-match category-A document must be present.
+        assert "alpha" in contents
+        # Only category-A docs should be returned
+        assert all(d.meta["category"] == "A" for d in result["documents"])
+        assert "beta" not in contents
+
+    def test_run_with_top_k_override(self, populated_store):
+        """A runtime `top_k` takes precedence over the init-time value."""
+        retriever = FAISSEmbeddingRetriever(document_store=populated_store, top_k=3)
+        result = retriever.run(query_embedding=[1.0, 0.0, 0.0], top_k=1)
+
+        assert len(result["documents"]) == 1
+
+    def test_to_dict_from_dict_roundtrip(self, document_store):
+        """Serializing and deserializing preserves every init parameter."""
+        init_filters = {"field": "meta.category", "operator": "==", "value": "A"}
+        retriever = FAISSEmbeddingRetriever(
+            document_store=document_store,
+            filters=init_filters,
+            top_k=5,
+            filter_policy=FilterPolicy.MERGE,
+        )
+
+        serialized = retriever.to_dict()
+        assert serialized["type"] == (
+            "haystack_integrations.components.retrievers.faiss.embedding_retriever.FAISSEmbeddingRetriever"
+        )
+        assert serialized["init_parameters"]["top_k"] == 5
+        assert serialized["init_parameters"]["filters"] == init_filters
+        assert serialized["init_parameters"]["filter_policy"] == FilterPolicy.MERGE.value
+        assert "document_store" in serialized["init_parameters"]
+
+        restored = FAISSEmbeddingRetriever.from_dict(serialized)
+        assert restored.top_k == 5
+        assert restored.filters == init_filters
+        assert restored.filter_policy == FilterPolicy.MERGE
+        assert isinstance(restored.document_store, FAISSDocumentStore)
+
+    def test_filter_policy_replace(self, populated_store):
+        """REPLACE: runtime filters fully replace init-time filters."""
+        init_filters = {"field": "meta.category", "operator": "==", "value": "A"}
+        runtime_filters = {"field": "meta.category", "operator": "==", "value": "B"}
+
+        retriever = FAISSEmbeddingRetriever(
+            document_store=populated_store,
+            filters=init_filters,
+            top_k=3,
+            filter_policy=FilterPolicy.REPLACE,
+        )
+        result = retriever.run(query_embedding=[0.0, 1.0, 0.0], filters=runtime_filters)
+
+        # An empty result would make the `all(...)` check below pass vacuously.
+        assert result["documents"]
+        # Only category B docs should appear — the init filter was replaced
+        assert all(d.meta["category"] == "B" for d in result["documents"])
+
+    def test_filter_policy_merge(self, populated_store):
+        """MERGE: with no runtime filters supplied, the init-time filters are applied on their own."""
+        init_filters = {"field": "meta.category", "operator": "==", "value": "A"}
+
+        retriever = FAISSEmbeddingRetriever(
+            document_store=populated_store,
+            filters=init_filters,
+            top_k=3,
+            filter_policy=FilterPolicy.MERGE,
+        )
+        # Run without any runtime filter — init filter alone should apply
+        result = retriever.run(query_embedding=[1.0, 0.0, 0.0])
+
+        assert len(result["documents"]) >= 1
+        assert all(d.meta["category"] == "A" for d in result["documents"])
+
+    def test_invalid_document_store_type(self):
+        """Passing anything other than a FAISSDocumentStore raises ValueError."""
+        with pytest.raises(ValueError, match="document_store must be an instance of FAISSDocumentStore"):
+            FAISSEmbeddingRetriever(document_store="not_a_store")  # type: ignore[arg-type]
+
+    def test_run_in_pipeline(self, populated_store):
+        """End-to-end: FAISSEmbeddingRetriever wired into a Haystack Pipeline."""
+        retriever = FAISSEmbeddingRetriever(document_store=populated_store, top_k=2)
+
+        pipeline = Pipeline()
+        pipeline.add_component("retriever", retriever)
+
+        result = pipeline.run({"retriever": {"query_embedding": [1.0, 0.0, 0.0]}})
+
+        assert "retriever" in result
+        assert "documents" in result["retriever"]
+        assert isinstance(result["retriever"]["documents"], list)
+        assert len(result["retriever"]["documents"]) == 2
+        assert all(isinstance(d, Document) for d in result["retriever"]["documents"])

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>`
	`2`	`+#`
	`3`	`+# SPDX-License-Identifier: Apache-2.0`