diff --git a/docs-website/docs/pipeline-components/retrievers.mdx b/docs-website/docs/pipeline-components/retrievers.mdx index eef648fdff..a3aa2cc69c 100644 --- a/docs-website/docs/pipeline-components/retrievers.mdx +++ b/docs-website/docs/pipeline-components/retrievers.mdx @@ -154,7 +154,6 @@ For details on how to initialize and use a Retriever in a pipeline, see the docu | [InMemoryBM25Retriever](retrievers/inmemorybm25retriever.mdx) | A keyword-based Retriever compatible with the InMemoryDocumentStore. | | [InMemoryEmbeddingRetriever](retrievers/inmemoryembeddingretriever.mdx) | An embedding-based Retriever compatible with the InMemoryDocumentStore. | | [FilterRetriever](retrievers/filterretriever.mdx) | A special Retriever to be used with any Document Store to get the Documents that match specific filters. | -| [MultiFilterRetriever](retrievers/multifilterretriever.mdx) | A special Retriever to be used with any Document Store to retrieve Documents matching multiple filters in parallel. | | [MultiQueryEmbeddingRetriever](retrievers/multiqueryembeddingretriever.mdx) | Retrieves documents using multiple queries in parallel with an embedding-based Retriever. | | [MultiQueryTextRetriever](retrievers/multiquerytextretriever.mdx) | Retrieves documents using multiple queries in parallel with a text-based Retriever. | | [MongoDBAtlasEmbeddingRetriever](retrievers/mongodbatlasembeddingretriever.mdx) | An embedding Retriever compatible with the MongoDB Atlas Document Store. | diff --git a/docs-website/docs/pipeline-components/retrievers/multifilterretriever.mdx b/docs-website/docs/pipeline-components/retrievers/multifilterretriever.mdx deleted file mode 100644 index 38bffea3e1..0000000000 --- a/docs-website/docs/pipeline-components/retrievers/multifilterretriever.mdx +++ /dev/null @@ -1,125 +0,0 @@ ---- -title: "MultiFilterRetriever" -id: multifilterretriever -slug: "/multifilterretriever" -description: "Use this Retriever with any Document Store to retrieve Documents matching multiple filters in parallel." ---- - -# MultiFilterRetriever - -Use this Retriever with any Document Store to retrieve Documents matching multiple filters in parallel. - -
- -| | | -| --- | --- | -| **Most common position in a pipeline** | At the beginning of a Pipeline | -| **Mandatory init variables** | `document_store`: An instance of a Document Store | -| **Mandatory run variables** | `filters`: A list of filter dictionaries in the same syntax supported by the Document Stores | -| **Output variables** | `documents`: All the documents that match at least one of the provided filters, deduplicated | -| **API reference** | [Retrievers](/reference/retrievers-api) | -| **GitHub link** | https://github.com/deepset-ai/haystack/blob/main/haystack/components/retrievers/multi_filter_retriever.py | - -
- -## Overview - -`MultiFilterRetriever` is an extension of `FilterRetriever` that accepts a **list** of filter dictionaries and runs each filter against the Document Store **in parallel**. Results from all filters are merged and deduplicated before being returned. - -Use it when you need to retrieve Documents matching different criteria in a single pipeline step — for example, fetching English and German documents at the same time, or combining results from several independent filter conditions. - -Pay attention when using `MultiFilterRetriever` on a Document Store that contains many Documents, as each filter can return a large number of results. Passing an empty filter list returns no documents. - -`MultiFilterRetriever` does not score or rank Documents. If you need to rank the results by similarity to a query, consider using Ranker components after retrieval. - -## Usage - -### On its own - -```python -from haystack import Document -from haystack.components.retrievers import MultiFilterRetriever -from haystack.document_stores.in_memory import InMemoryDocumentStore -from haystack.components.writers import DocumentWriter -from haystack.document_stores.types import DuplicatePolicy - -documents = [ - Document(content="Python is a popular programming language", meta={"lang": "en"}), - Document(content="python ist eine beliebte Programmiersprache", meta={"lang": "de"}), -] - -document_store = InMemoryDocumentStore() -DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP).run(documents=documents) - -retriever = MultiFilterRetriever(document_store=document_store) - -filters = [ - {"field": "meta.lang", "operator": "==", "value": "en"}, - {"field": "meta.lang", "operator": "==", "value": "de"}, -] - -result = retriever.run(filters=filters) -for doc in result["documents"]: - print(doc.content) -``` - -### In a RAG pipeline - -Set your `OPENAI_API_KEY` as an environment variable and then run the following code: - -```python -import os - -from haystack import Document, Pipeline -from haystack.components.builders.prompt_builder import PromptBuilder -from haystack.components.generators import OpenAIGenerator -from haystack.components.retrievers import MultiFilterRetriever -from haystack.components.writers import DocumentWriter -from haystack.document_stores.in_memory import InMemoryDocumentStore -from haystack.document_stores.types import DuplicatePolicy - -document_store = InMemoryDocumentStore() -documents = [ - Document(content="Mark lives in Berlin.", meta={"year": 2018}), - Document(content="Mark lives in Paris.", meta={"year": 2021}), - Document(content="Mark is Danish.", meta={"year": 2021}), - Document(content="Mark lives in New York.", meta={"year": 2023}), -] -DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP).run(documents=documents) - -prompt_template = """ - Given these documents, answer the question.\nDocuments: - {% for doc in documents %} - {{ doc.content }} - {% endfor %} - - \nQuestion: {{question}} - \nAnswer: - """ - -rag_pipeline = Pipeline() -rag_pipeline.add_component(name="retriever", instance=MultiFilterRetriever(document_store=document_store)) -rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") -rag_pipeline.add_component(instance=OpenAIGenerator(), name="llm") -rag_pipeline.connect("retriever", "prompt_builder.documents") -rag_pipeline.connect("prompt_builder", "llm") - -result = rag_pipeline.run( - { - "retriever": { - "filters": [ - {"field": "meta.year", "operator": "==", "value": 2021}, - {"field": "meta.year", "operator": "==", "value": 2023}, - ] - }, - "prompt_builder": {"question": "Where does Mark live?"}, - } -) -print(result["llm"]["replies"][0]) -``` - -Here's an example output you might get: - -``` -According to the provided documents, Mark lives in New York. -``` diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 1365957d87..1acc95f5d9 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -533,7 +533,6 @@ export default { 'pipeline-components/retrievers/elasticsearchembeddingretriever', 'pipeline-components/retrievers/faissembeddingretriever', 'pipeline-components/retrievers/filterretriever', - 'pipeline-components/retrievers/multifilterretriever', 'pipeline-components/retrievers/inmemorybm25retriever', 'pipeline-components/retrievers/inmemoryembeddingretriever', 'pipeline-components/retrievers/mongodbatlasembeddingretriever', diff --git a/haystack/components/retrievers/__init__.py b/haystack/components/retrievers/__init__.py index 92404414e5..0eb2227822 100644 --- a/haystack/components/retrievers/__init__.py +++ b/haystack/components/retrievers/__init__.py @@ -11,7 +11,6 @@ "auto_merging_retriever": ["AutoMergingRetriever"], "filter_retriever": ["FilterRetriever"], "in_memory": ["InMemoryBM25Retriever", "InMemoryEmbeddingRetriever"], - "multi_filter_retriever": ["MultiFilterRetriever"], "multi_query_embedding_retriever": ["MultiQueryEmbeddingRetriever"], "multi_query_text_retriever": ["MultiQueryTextRetriever"], "sentence_window_retriever": ["SentenceWindowRetriever"], @@ -22,7 +21,6 @@ from .filter_retriever import FilterRetriever as FilterRetriever from .in_memory import InMemoryBM25Retriever as InMemoryBM25Retriever from .in_memory import InMemoryEmbeddingRetriever as InMemoryEmbeddingRetriever - from .multi_filter_retriever import MultiFilterRetriever as MultiFilterRetriever from .multi_query_embedding_retriever import MultiQueryEmbeddingRetriever as MultiQueryEmbeddingRetriever from .multi_query_text_retriever import MultiQueryTextRetriever as MultiQueryTextRetriever from .sentence_window_retriever import SentenceWindowRetriever as SentenceWindowRetriever diff --git a/haystack/components/retrievers/multi_filter_retriever.py b/haystack/components/retrievers/multi_filter_retriever.py deleted file mode 100644 index 5e5fec65eb..0000000000 --- a/haystack/components/retrievers/multi_filter_retriever.py +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-FileCopyrightText: 2022-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -from concurrent.futures import ThreadPoolExecutor -from typing import Any - -from haystack import Document, component -from haystack.components.retrievers.filter_retriever import FilterRetriever -from haystack.document_stores.types import DocumentStore -from haystack.utils.misc import _deduplicate_documents - - -@component -class MultiFilterRetriever: - """ - A component that retrieves documents using multiple filters in parallel. - - This component takes a list of filter dictionaries and retrieves matching documents for each filter set - in parallel. - - ### Usage example - - ```python - from haystack import Document - from haystack.components.retrievers import MultiFilterRetriever - from haystack.document_stores.in_memory import InMemoryDocumentStore - from haystack.components.writers import DocumentWriter - from haystack.document_stores.types import DuplicatePolicy - - documents = [ - Document(content="Python is a popular programming language", meta={"lang": "en"}), - Document(content="python ist eine beliebte Programmiersprache", meta={"lang": "de"}), - ] - - document_store = InMemoryDocumentStore() - writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP) - writer.run(documents=documents) - - multi_filter_retriever = MultiFilterRetriever(document_store=document_store) - - filters = [ - {"field": "meta.lang", "operator": "==", "value": "en"}, - {"field": "meta.lang", "operator": "==", "value": "de"}, - ] - - result = multi_filter_retriever.run(filters=filters) - for doc in result["documents"]: - print(doc.content) - ``` - """ - - def __init__(self, document_store: DocumentStore, max_workers: int = 3) -> None: - """ - Initialize MultiFilterRetriever. - - :param document_store: The document store to retrieve documents from. - :param max_workers: Maximum number of worker threads for parallel processing. - """ - self.document_store = document_store - self.max_workers = max_workers - self._retriever = FilterRetriever(document_store=document_store) - - @component.output_types(documents=list[Document]) - def run(self, filters: list[dict[str, Any]]) -> dict[str, list[Document]]: - """ - Retrieve documents using multiple filters in parallel. - - :param filters: List of filter dictionaries to process. - :returns: - A dictionary containing: - - `documents`: List of retrieved documents. - """ - docs: list[Document] = [] - - with ThreadPoolExecutor(max_workers=self.max_workers) as executor: - filters_results = executor.map(self._run_on_thread, filters) - for result in filters_results: - if not result: - continue - docs.extend(result) - - docs = _deduplicate_documents(docs) - - return {"documents": docs} - - def _run_on_thread(self, filters: dict[str, Any]) -> list[Document] | None: - result = self._retriever.run(filters=filters) - if result and "documents" in result: - return result["documents"] - return None diff --git a/releasenotes/notes/feat-multi-filter-retriever-266283c61e693da3.yaml b/releasenotes/notes/feat-multi-filter-retriever-266283c61e693da3.yaml deleted file mode 100644 index 56f577d6a6..0000000000 --- a/releasenotes/notes/feat-multi-filter-retriever-266283c61e693da3.yaml +++ /dev/null @@ -1,19 +0,0 @@ ---- -features: - - | - Add support for ``MultiFilterRetriever``, a new retriever component that executes multiple filter - queries against a document store **in parallel** and returns a single, de-duplicated list of - documents. - - .. code-block:: python - - multi_filter_retriever = MultiFilterRetriever(document_store=document_store) - - filters = [ - {"field": "meta.lang", "operator": "==", "value": "en"}, - {"field": "meta.lang", "operator": "==", "value": "de"}, - ] - - result = multi_filter_retriever.run(filters=filters) - for doc in result["documents"]: - print(doc.content) diff --git a/test/components/retrievers/test_multi_filter_retriever.py b/test/components/retrievers/test_multi_filter_retriever.py deleted file mode 100644 index 4655efa591..0000000000 --- a/test/components/retrievers/test_multi_filter_retriever.py +++ /dev/null @@ -1,84 +0,0 @@ -# SPDX-FileCopyrightText: 2022-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from haystack import Document -from haystack.components.retrievers.multi_filter_retriever import MultiFilterRetriever -from haystack.components.writers import DocumentWriter -from haystack.core.serialization import component_from_dict, component_to_dict -from haystack.document_stores.in_memory import InMemoryDocumentStore -from haystack.document_stores.types import DuplicatePolicy - - -@pytest.fixture -def document_store() -> InMemoryDocumentStore: - store = InMemoryDocumentStore() - DocumentWriter(document_store=store, policy=DuplicatePolicy.SKIP).run( - documents=[ - Document(content="English text", id="doc1", meta={"lang": "en"}), - Document(content="German text", id="doc2", meta={"lang": "de"}), - ] - ) - return store - - -class TestMultiFilterRetriever: - def test_init(self, in_memory_doc_store) -> None: - multi = MultiFilterRetriever(document_store=in_memory_doc_store) - assert multi.document_store == in_memory_doc_store - assert multi.max_workers == 3 - - def test_init_custom_workers(self, in_memory_doc_store) -> None: - multi = MultiFilterRetriever(document_store=in_memory_doc_store, max_workers=5) - assert multi.max_workers == 5 - - def test_run_empty_filters(self, document_store) -> None: - multi = MultiFilterRetriever(document_store=document_store) - assert multi.run(filters=[]) == {"documents": []} - - def test_run_single_filter(self, document_store) -> None: - multi = MultiFilterRetriever(document_store=document_store) - result = multi.run(filters=[{"field": "meta.lang", "operator": "==", "value": "en"}]) - assert len(result["documents"]) == 1 - assert result["documents"][0].meta["lang"] == "en" - - def test_run_multiple_filters(self, document_store) -> None: - multi = MultiFilterRetriever(document_store=document_store) - result = multi.run( - filters=[ - {"field": "meta.lang", "operator": "==", "value": "en"}, - {"field": "meta.lang", "operator": "==", "value": "de"}, - ] - ) - assert len(result["documents"]) == 2 - assert {doc.meta["lang"] for doc in result["documents"]} == {"en", "de"} - - def test_deduplication(self) -> None: - document_store = InMemoryDocumentStore() - doc = Document(content="Haystack is awesome", id="doc", meta={"lang": "en", "type": "tech"}) - DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP).run(documents=[doc]) - - multi = MultiFilterRetriever(document_store=document_store, max_workers=1) - result = multi.run( - filters=[ - {"field": "meta.lang", "operator": "==", "value": "en"}, - {"field": "meta.type", "operator": "==", "value": "tech"}, - ] - ) - assert len(result["documents"]) == 1 - assert result["documents"][0].id == "doc" - - def test_to_dict(self, document_store) -> None: - multi = MultiFilterRetriever(document_store=document_store, max_workers=2) - data = component_to_dict(multi, "multi_filter") - assert data["type"] == "haystack.components.retrievers.multi_filter_retriever.MultiFilterRetriever" - assert data["init_parameters"]["max_workers"] == 2 - - def test_from_dict(self, document_store) -> None: - multi = MultiFilterRetriever(document_store=document_store, max_workers=2) - serialized = component_to_dict(multi, "multi_filter") - deserialized = component_from_dict(MultiFilterRetriever, serialized, "multi_filter") - assert isinstance(deserialized, MultiFilterRetriever) - assert deserialized.max_workers == 2