From 3c8bf8b7de30fb5cc02375f01c17a1e754fe4848 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Mon, 26 May 2025 14:38:33 +0200 Subject: [PATCH 1/6] Add to_dict test --- .../document_stores/qdrant/document_store.py | 1 - .../qdrant/tests/test_document_store.py | 57 +++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index fdde8e2a89..7cd68f3650 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -232,7 +232,6 @@ def __init__( self.path = path self.force_disable_check_same_thread = force_disable_check_same_thread self.metadata = metadata or {} - self.api_key = api_key # Store the Qdrant collection specific attributes self.shard_number = shard_number diff --git a/integrations/qdrant/tests/test_document_store.py b/integrations/qdrant/tests/test_document_store.py index 2e496cd0fa..c6253f0ce5 100644 --- a/integrations/qdrant/tests/test_document_store.py +++ b/integrations/qdrant/tests/test_document_store.py @@ -6,6 +6,7 @@ from haystack.dataclasses import SparseEmbedding from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret from haystack.testing.document_store import ( CountDocumentsTest, DeleteDocumentsTest, @@ -38,6 +39,62 @@ def test_init_is_lazy(self): QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True) mocked_qdrant.assert_not_called() + def test_to_dict(self, monkeypatch): + monkeypatch.setenv("QDRANT_API_KEY", "test_api_key") + doc_store = QdrantDocumentStore( + ":memory:", + recreate_index=True, + return_embedding=True, + wait_result_from_api=True, + use_sparse_embeddings=False, + api_key=Secret.from_env_var("QDRANT_API_KEY"), + ) + expected_dict = { + "type": "haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore", + "init_parameters": { + "location": ":memory:", + "url": None, + "port": 6333, + "grpc_port": 6334, + "prefer_grpc": False, + "https": None, + "api_key": { + "env_vars": ["QDRANT_API_KEY"], + "strict": True, + "type": "env_var", + }, + "prefix": None, + "timeout": None, + "host": None, + "path": None, + "force_disable_check_same_thread": False, + "index": "Document", + "embedding_dim": 768, + "on_disk": False, + "use_sparse_embeddings": False, + "sparse_idf": False, + "similarity": "cosine", + "return_embedding": True, + "progress_bar": True, + "recreate_index": True, + "shard_number": None, + "replication_factor": None, + "write_consistency_factor": None, + "on_disk_payload": None, + "hnsw_config": None, + "optimizers_config": None, + "wal_config": None, + "quantization_config": None, + "init_from": None, + "wait_result_from_api": True, + "metadata": {}, + "write_batch_size": 100, + "scroll_size": 10000, + "payload_fields_to_index": None, + }, + } + assert doc_store.to_dict() == expected_dict + def assert_documents_are_equal(self, received: List[Document], expected: List[Document]): """ Assert that two lists of Documents are equal. From e52b768fe382d6b994a97fc663a7d97720912cf2 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 27 May 2025 09:46:34 +0200 Subject: [PATCH 2/6] Add more type hints --- .../document_stores/qdrant/document_store.py | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 7cd68f3650..654439aa13 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -1,6 +1,6 @@ import inspect from itertools import islice -from typing import Any, AsyncGenerator, ClassVar, Dict, Generator, List, Optional, Set, Union +from typing import Any, AsyncGenerator, ClassVar, Dict, Generator, List, Optional, Set, Tuple, Union import numpy as np import qdrant_client @@ -21,6 +21,7 @@ convert_haystack_documents_to_qdrant_points, convert_id, convert_qdrant_point_to_haystack_document, + QdrantPoint ) from .filters import convert_filters_to_qdrant @@ -34,7 +35,7 @@ class QdrantStoreError(DocumentStoreError): FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]] -def get_batches_from_generator(iterable, n): +def get_batches_from_generator(iterable: List, n: int) -> Generator: """ Batch elements of an iterable into fixed-length chunks or blocks. """ @@ -127,7 +128,7 @@ def __init__( write_batch_size: int = 100, scroll_size: int = 10_000, payload_fields_to_index: Optional[List[dict]] = None, - ): + ) -> None: """ :param location: If `memory` - use in-memory Qdrant instance. @@ -164,7 +165,7 @@ def __init__( Dimension of the embeddings. :param on_disk: Whether to store the collection on disk. - :param use_sparse_embedding: + :param use_sparse_embeddings: If set to `True`, enables support for sparse embeddings. :param sparse_idf: If set to `True`, computes the Inverse Document Frequency (IDF) when using sparse embeddings. @@ -257,7 +258,7 @@ def __init__( self.write_batch_size = write_batch_size self.scroll_size = scroll_size - def _initialize_client(self): + def _initialize_client(self) -> None: if self._client is None: client_params = self._prepare_client_params() self._client = qdrant_client.QdrantClient(**client_params) @@ -273,7 +274,7 @@ def _initialize_client(self): self.payload_fields_to_index, ) - async def _initialize_async_client(self): + async def _initialize_async_client(self) -> None: """ Returns the asynchronous Qdrant client, initializing it if necessary. """ @@ -627,8 +628,6 @@ def get_documents_by_id( :param ids: A list of document IDs to retrieve. - :param index: - The name of the index to retrieve documents from. :returns: A list of documents. """ @@ -660,8 +659,6 @@ async def get_documents_by_id_async( :param ids: A list of document IDs to retrieve. - :param index: - The name of the index to retrieve documents from. :returns: A list of documents. """ @@ -1209,7 +1206,7 @@ def get_distance(self, similarity: str) -> rest.Distance: ) raise QdrantStoreError(msg) from ke - def _create_payload_index(self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None): + def _create_payload_index(self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None) -> None: """ Create payload index for the collection if payload_fields_to_index is provided See: https://qdrant.tech/documentation/concepts/indexing/#payload-index @@ -1228,7 +1225,7 @@ def _create_payload_index(self, collection_name: str, payload_fields_to_index: O async def _create_payload_index_async( self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None - ): + ) -> None: """ Asynchronously create payload index for the collection if payload_fields_to_index is provided See: https://qdrant.tech/documentation/concepts/indexing/#payload-index @@ -1256,7 +1253,7 @@ def _set_up_collection( sparse_idf: bool, on_disk: bool = False, payload_fields_to_index: Optional[List[dict]] = None, - ): + ) -> None: """ Sets up the Qdrant collection with the specified parameters. :param collection_name: @@ -1312,7 +1309,7 @@ async def _set_up_collection_async( sparse_idf: bool, on_disk: bool = False, payload_fields_to_index: Optional[List[dict]] = None, - ): + ) -> None: """ Asynchronously sets up the Qdrant collection with the specified parameters. :param collection_name: @@ -1366,7 +1363,7 @@ def recreate_collection( on_disk: Optional[bool] = None, use_sparse_embeddings: Optional[bool] = None, sparse_idf: bool = False, - ): + ) -> None: """ Recreates the Qdrant collection with the specified parameters. @@ -1409,7 +1406,7 @@ async def recreate_collection_async( on_disk: Optional[bool] = None, use_sparse_embeddings: Optional[bool] = None, sparse_idf: bool = False, - ): + ) -> None: """ Asynchronously recreates the Qdrant collection with the specified parameters. @@ -1448,7 +1445,7 @@ def _handle_duplicate_documents( self, documents: List[Document], policy: DuplicatePolicy = None, - ): + ) -> List[Document]: """ Checks whether any of the passed documents is already existing in the chosen index and returns a list of documents that are not in the index yet. @@ -1475,7 +1472,7 @@ async def _handle_duplicate_documents_async( self, documents: List[Document], policy: DuplicatePolicy = None, - ): + ) -> List[Document]: """ Asynchronously checks whether any of the passed documents is already existing in the chosen index and returns a list of @@ -1520,7 +1517,7 @@ def _drop_duplicate_documents(self, documents: List[Document]) -> List[Document] return _documents - def _prepare_collection_params(self): + def _prepare_collection_params(self) -> Dict[str, Any]: """ Prepares the common parameters for collection creation. """ @@ -1536,7 +1533,7 @@ def _prepare_collection_params(self): "init_from": self.init_from, } - def _prepare_client_params(self): + def _prepare_client_params(self) -> Dict[str, Any]: """ Prepares the common parameters for client initialization. @@ -1564,7 +1561,7 @@ def _prepare_collection_config( on_disk: Optional[bool] = None, use_sparse_embeddings: Optional[bool] = None, sparse_idf: bool = False, - ): + ) -> Tuple[Dict[str, rest.VectorParams], Optional[Dict[str, rest.SparseVectorParams]]]: """ Prepares the configuration for creating or recreating a Qdrant collection. @@ -1594,9 +1591,12 @@ def _prepare_collection_config( return vectors_config, sparse_vectors_config - def _validate_filters(self, filters: Optional[Union[Dict[str, Any], rest.Filter]] = None): + def _validate_filters(self, filters: Optional[Union[Dict[str, Any], rest.Filter]] = None) -> None: """ Validates the filters provided for querying. + + :param filters: Filters to validate. Can be a dictionary or an instance of `qdrant_client.http.models.Filter`. + :raises ValueError: If the filters are not in the correct format or syntax. """ if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter): msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`" @@ -1606,7 +1606,7 @@ def _validate_filters(self, filters: Optional[Union[Dict[str, Any], rest.Filter] msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details." raise ValueError(msg) - def _process_query_point_results(self, results, scale_score: bool = False): + def _process_query_point_results(self, results: List[QdrantPoint], scale_score: bool = False) -> List[Document]: """ Processes query results from Qdrant. """ @@ -1626,7 +1626,7 @@ def _process_query_point_results(self, results, scale_score: bool = False): return documents - def _process_group_results(self, groups): + def _process_group_results(self, groups: List[rest.PointGroup]) -> List[Document]: """ Processes grouped query results from Qdrant. @@ -1646,7 +1646,7 @@ def _validate_collection_compatibility( collection_info, distance, embedding_dim: int, - ): + ) -> None: """ Validates that an existing collection is compatible with the current configuration. """ From 6985a8f1ce2d37d67e585767462d5a09b6f604c8 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 27 May 2025 09:49:51 +0200 Subject: [PATCH 3/6] More type hints --- .../components/retrievers/qdrant/retriever.py | 18 +++++++++--------- .../document_stores/qdrant/filters.py | 8 ++++---- .../qdrant/migrate_to_sparse.py | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py index 59b0897605..7cd9fa3901 100644 --- a/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py +++ b/integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py @@ -46,7 +46,7 @@ def __init__( score_threshold: Optional[float] = None, group_by: Optional[str] = None, group_size: Optional[int] = None, - ): + ) -> None: """ Create a QdrantEmbeddingRetriever component. @@ -136,7 +136,7 @@ def run( score_threshold: Optional[float] = None, group_by: Optional[str] = None, group_size: Optional[int] = None, - ): + ) -> Dict[str, List[Document]]: """ Run the Embedding Retriever on the given input data. @@ -180,7 +180,7 @@ async def run_async( score_threshold: Optional[float] = None, group_by: Optional[str] = None, group_size: Optional[int] = None, - ): + ) -> Dict[str, List[Document]]: """ Asynchronously run the Embedding Retriever on the given input data. @@ -252,7 +252,7 @@ def __init__( score_threshold: Optional[float] = None, group_by: Optional[str] = None, group_size: Optional[int] = None, - ): + ) -> None: """ Create a QdrantSparseEmbeddingRetriever component. @@ -342,7 +342,7 @@ def run( score_threshold: Optional[float] = None, group_by: Optional[str] = None, group_size: Optional[int] = None, - ): + ) -> Dict[str, List[Document]]: """ Run the Sparse Embedding Retriever on the given input data. @@ -391,7 +391,7 @@ async def run_async( score_threshold: Optional[float] = None, group_by: Optional[str] = None, group_size: Optional[int] = None, - ): + ) -> Dict[str, List[Document]]: """ Asynchronously run the Sparse Embedding Retriever on the given input data. @@ -473,7 +473,7 @@ def __init__( score_threshold: Optional[float] = None, group_by: Optional[str] = None, group_size: Optional[int] = None, - ): + ) -> None: """ Create a QdrantHybridRetriever component. @@ -557,7 +557,7 @@ def run( score_threshold: Optional[float] = None, group_by: Optional[str] = None, group_size: Optional[int] = None, - ): + ) -> Dict[str, List[Document]]: """ Run the Sparse Embedding Retriever on the given input data. @@ -606,7 +606,7 @@ async def run_async( score_threshold: Optional[float] = None, group_by: Optional[str] = None, group_size: Optional[int] = None, - ): + ) -> Dict[str, List[Document]]: """ Asynchronously run the Sparse Embedding Retriever on the given input data. diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/filters.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/filters.py index 69fd7cbbd1..36d512ae4f 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/filters.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/filters.py @@ -138,10 +138,10 @@ def convert_filters_to_qdrant( def build_filters_for_repeated_operators( - must_clauses, - should_clauses, - must_not_clauses, - qdrant_filter, + must_clauses: List, + should_clauses: List, + must_not_clauses: List, + qdrant_filter: List[models.Filter], ) -> List[models.Filter]: """ Flattens the nested lists of clauses by creating separate Filters for each clause of a logical operator. diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/migrate_to_sparse.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/migrate_to_sparse.py index 954269ce4b..0af7096828 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/migrate_to_sparse.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/migrate_to_sparse.py @@ -11,7 +11,7 @@ logger.setLevel(python_logging.INFO) -def migrate_to_sparse_embeddings_support(old_document_store: QdrantDocumentStore, new_index: str): +def migrate_to_sparse_embeddings_support(old_document_store: QdrantDocumentStore, new_index: str) -> None: """ Utility function to migrate an existing `QdrantDocumentStore` to a new one with support for sparse embeddings. From f856fb30e25fd2adb3e4bb48fc540f8eba409741 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 27 May 2025 10:19:54 +0200 Subject: [PATCH 4/6] Add fix for exposing api key in metadata when running to_dict --- .../document_stores/qdrant/document_store.py | 41 +++++++++++-------- .../qdrant/tests/test_document_store.py | 2 +- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 654439aa13..26fb68aa75 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -1,4 +1,5 @@ import inspect +from copy import deepcopy from itertools import islice from typing import Any, AsyncGenerator, ClassVar, Dict, Generator, List, Optional, Set, Tuple, Union @@ -18,10 +19,10 @@ from .converters import ( DENSE_VECTORS_NAME, SPARSE_VECTORS_NAME, + QdrantPoint, convert_haystack_documents_to_qdrant_points, convert_id, convert_qdrant_point_to_haystack_document, - QdrantPoint ) from .filters import convert_filters_to_qdrant @@ -131,7 +132,7 @@ def __init__( ) -> None: """ :param location: - If `memory` - use in-memory Qdrant instance. + If `":memory:"` - use in-memory Qdrant instance. If `str` - use it as a URL parameter. If `None` - use default values for host and port. :param url: @@ -261,6 +262,7 @@ def __init__( def _initialize_client(self) -> None: if self._client is None: client_params = self._prepare_client_params() + # This step adds the api-key and User-Agent to metadata self._client = qdrant_client.QdrantClient(**client_params) # Make sure the collection is properly set up self._set_up_collection( @@ -1538,21 +1540,26 @@ def _prepare_client_params(self) -> Dict[str, Any]: Prepares the common parameters for client initialization. """ - return { - "location": self.location, - "url": self.url, - "port": self.port, - "grpc_port": self.grpc_port, - "prefer_grpc": self.prefer_grpc, - "https": self.https, - "api_key": self.api_key.resolve_value() if self.api_key else None, - "prefix": self.prefix, - "timeout": self.timeout, - "host": self.host, - "path": self.path, - "metadata": self.metadata, - "force_disable_check_same_thread": self.force_disable_check_same_thread, - } + # NOTE: We need to use deepcopy here to avoid modifying the original class attributes. + # For example, the resolved api key is added to metadata by the QdrantClient class when using a hosted + # Qdrant service, which means running to_dict() exposes the api key. + return deepcopy( + { + "location": self.location, + "url": self.url, + "port": self.port, + "grpc_port": self.grpc_port, + "prefer_grpc": self.prefer_grpc, + "https": self.https, + "api_key": self.api_key.resolve_value() if self.api_key else None, + "prefix": self.prefix, + "timeout": self.timeout, + "host": self.host, + "path": self.path, + "metadata": self.metadata, + "force_disable_check_same_thread": self.force_disable_check_same_thread, + } + ) def _prepare_collection_config( self, diff --git a/integrations/qdrant/tests/test_document_store.py b/integrations/qdrant/tests/test_document_store.py index c6253f0ce5..8fa3f0acc2 100644 --- a/integrations/qdrant/tests/test_document_store.py +++ b/integrations/qdrant/tests/test_document_store.py @@ -6,13 +6,13 @@ from haystack.dataclasses import SparseEmbedding from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy -from haystack.utils import Secret from haystack.testing.document_store import ( CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest, _random_embeddings, ) +from haystack.utils import Secret from qdrant_client.http import models as rest from haystack_integrations.document_stores.qdrant.document_store import ( From 701cf450bb24d15603e41d21c86728b4501d253a Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 27 May 2025 10:27:12 +0200 Subject: [PATCH 5/6] Add unit test --- .../qdrant/tests/test_document_store.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/integrations/qdrant/tests/test_document_store.py b/integrations/qdrant/tests/test_document_store.py index 8fa3f0acc2..1920aaf8c2 100644 --- a/integrations/qdrant/tests/test_document_store.py +++ b/integrations/qdrant/tests/test_document_store.py @@ -39,6 +39,23 @@ def test_init_is_lazy(self): QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True) mocked_qdrant.assert_not_called() + def test_prepare_client_params_no_mutability(self): + metadata = {"key": "value"} + doc_store = QdrantDocumentStore( + ":memory:", + recreate_index=True, + return_embedding=True, + wait_result_from_api=True, + use_sparse_embeddings=False, + metadata=metadata, + ) + client_params = doc_store._prepare_client_params() + # Mutate value of metadata in client_params + client_params["metadata"] = client_params["metadata"].update({"new_key": "new_value"}) + + # Assert that the original metadata in the document store is unchanged + assert metadata == {"key": "value"} + def test_to_dict(self, monkeypatch): monkeypatch.setenv("QDRANT_API_KEY", "test_api_key") doc_store = QdrantDocumentStore( From 2633b200f166949f0788d44a80195e0ff31f6dbc Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 27 May 2025 10:41:43 +0200 Subject: [PATCH 6/6] PR comments --- .../document_stores/qdrant/document_store.py | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py index 26fb68aa75..357b5adb04 100644 --- a/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py +++ b/integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py @@ -1,5 +1,4 @@ import inspect -from copy import deepcopy from itertools import islice from typing import Any, AsyncGenerator, ClassVar, Dict, Generator, List, Optional, Set, Tuple, Union @@ -1540,26 +1539,24 @@ def _prepare_client_params(self) -> Dict[str, Any]: Prepares the common parameters for client initialization. """ - # NOTE: We need to use deepcopy here to avoid modifying the original class attributes. - # For example, the resolved api key is added to metadata by the QdrantClient class when using a hosted - # Qdrant service, which means running to_dict() exposes the api key. - return deepcopy( - { - "location": self.location, - "url": self.url, - "port": self.port, - "grpc_port": self.grpc_port, - "prefer_grpc": self.prefer_grpc, - "https": self.https, - "api_key": self.api_key.resolve_value() if self.api_key else None, - "prefix": self.prefix, - "timeout": self.timeout, - "host": self.host, - "path": self.path, - "metadata": self.metadata, - "force_disable_check_same_thread": self.force_disable_check_same_thread, - } - ) + return { + "location": self.location, + "url": self.url, + "port": self.port, + "grpc_port": self.grpc_port, + "prefer_grpc": self.prefer_grpc, + "https": self.https, + "api_key": self.api_key.resolve_value() if self.api_key else None, + "prefix": self.prefix, + "timeout": self.timeout, + "host": self.host, + "path": self.path, + # NOTE: We purposefully expand the fields of self.metadata to avoid modifying the original self.metadata + # class attribute. For example, the resolved api key is added to metadata by the QdrantClient class + # when using a hosted Qdrant service, which means running to_dict() exposes the api key. + "metadata": {**self.metadata}, + "force_disable_check_same_thread": self.force_disable_check_same_thread, + } def _prepare_collection_config( self,