From 09aa131b8439834b07d1b48ad3cadc6974ea2061 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Thu, 2 Apr 2026 20:16:45 +0500 Subject: [PATCH 01/10] refactor(qdrant): use async DocumentStore mixin tests Closes #3052 Replaces duplicate async test implementations with the mixin classes from haystack.testing.document_store and haystack.testing.document_store_async. - Removes 19 tests now covered by the mixins - Keeps all Qdrant-specific tests (hybrid search, sparse config, collection setup validation, vector preservation) --- .../qdrant/tests/test_document_store_async.py | 601 +++--------------- 1 file changed, 106 insertions(+), 495 deletions(-) diff --git a/integrations/qdrant/tests/test_document_store_async.py b/integrations/qdrant/tests/test_document_store_async.py index dd6467b657..4e9f3ed9b8 100644 --- a/integrations/qdrant/tests/test_document_store_async.py +++ b/integrations/qdrant/tests/test_document_store_async.py @@ -1,13 +1,26 @@ from unittest.mock import MagicMock, patch import pytest +import pytest_asyncio from haystack import Document from haystack.dataclasses import SparseEmbedding -from haystack.document_stores.errors import DuplicateDocumentError -from haystack.document_stores.types import DuplicatePolicy from haystack.testing.document_store import ( + CountDocumentsByFilterAsyncTest, + CountUniqueMetadataByFilterAsyncTest, + UpdateByFilterAsyncTest, _random_embeddings, ) +from haystack.testing.document_store_async import ( + CountDocumentsAsyncTest, + DeleteAllAsyncTest, + DeleteByFilterAsyncTest, + DeleteDocumentsAsyncTest, + FilterDocumentsAsyncTest, + GetMetadataFieldMinMaxAsyncTest, + GetMetadataFieldsInfoAsyncTest, + GetMetadataFieldUniqueValuesAsyncTest, + WriteDocumentsAsyncTest, +) from qdrant_client.http import models as rest from haystack_integrations.document_stores.qdrant.document_store import ( @@ -18,117 +31,8 @@ ) -class TestQdrantDocumentStore: - @pytest.fixture - def document_store(self) -> QdrantDocumentStore: - return QdrantDocumentStore( - ":memory:", - recreate_index=True, - return_embedding=True, - wait_result_from_api=True, - use_sparse_embeddings=False, - progress_bar=False, - ) - - @pytest.mark.asyncio - async def test_write_documents_async(self, document_store: QdrantDocumentStore): - docs = [Document(id="1")] - - result = await document_store.write_documents_async(docs) - assert result == 1 - with pytest.raises(DuplicateDocumentError): - await document_store.write_documents_async(docs, DuplicatePolicy.FAIL) - - @pytest.mark.asyncio - async def test_sparse_configuration_async(self): - document_store = QdrantDocumentStore( - ":memory:", - recreate_index=True, - use_sparse_embeddings=True, - sparse_idf=True, - ) - await document_store._initialize_async_client() - - collection = await document_store._async_client.get_collection("Document") - sparse_config = collection.config.params.sparse_vectors - - assert SPARSE_VECTORS_NAME in sparse_config - - # check that the `sparse_idf` parameter takes effect - assert hasattr(sparse_config[SPARSE_VECTORS_NAME], "modifier") - assert sparse_config[SPARSE_VECTORS_NAME].modifier == rest.Modifier.IDF - - @pytest.mark.asyncio - async def test_query_hybrid_async(self, generate_sparse_embedding): - document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True, progress_bar=False) - - docs = [] - for i in range(20): - docs.append( - Document( - content=f"doc {i}", sparse_embedding=generate_sparse_embedding(), embedding=_random_embeddings(768) - ) - ) - - await document_store.write_documents_async(docs) - sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33]) - embedding = [0.1] * 768 - - results: list[Document] = await document_store._query_hybrid_async( - query_sparse_embedding=sparse_embedding, query_embedding=embedding, top_k=10, return_embedding=True - ) - assert len(results) == 10 - - for document in results: - assert document.sparse_embedding - assert document.embedding - - @pytest.mark.asyncio - async def test_query_hybrid_with_group_by_async(self, generate_sparse_embedding): - document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True, progress_bar=False) - - docs = [] - for i in range(20): - docs.append( - Document( - content=f"doc {i}", - sparse_embedding=generate_sparse_embedding(), - embedding=_random_embeddings(768), - meta={"group_field": i // 2}, - ) - ) - - await document_store.write_documents_async(docs) - - sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33]) - embedding = [0.1] * 768 - - results: list[Document] = await document_store._query_hybrid_async( - query_sparse_embedding=sparse_embedding, - query_embedding=embedding, - top_k=3, - return_embedding=True, - group_by="meta.group_field", - group_size=2, - ) - assert len(results) == 6 - - for document in results: - assert document.sparse_embedding - assert document.embedding - - @pytest.mark.asyncio - async def test_query_hybrid_fail_without_sparse_embedding_async(self, document_store): - sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33]) - embedding = [0.1] * 768 - - with pytest.raises(QdrantStoreError): - await document_store._query_hybrid_async( - query_sparse_embedding=sparse_embedding, - query_embedding=embedding, - ) - - @pytest.mark.asyncio +@pytest.mark.asyncio +class TestQdrantDocumentStoreAsyncUnit: async def test_query_hybrid_search_batch_failure_async(self): document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True) await document_store._initialize_async_client() @@ -141,11 +45,9 @@ async def test_query_hybrid_search_batch_failure_async(self): query_sparse_embedding=sparse_embedding, query_embedding=embedding ) - @pytest.mark.asyncio async def test_set_up_collection_with_dimension_mismatch_async(self): document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=False, similarity="cosine") await document_store._initialize_async_client() - # Mock collection info with different vector size mock_collection_info = MagicMock() mock_collection_info.config.params.vectors = MagicMock() mock_collection_info.config.params.vectors.distance = rest.Distance.COSINE @@ -158,11 +60,9 @@ async def test_set_up_collection_with_dimension_mismatch_async(self): with pytest.raises(ValueError, match="different vector size"): await document_store._set_up_collection_async("test_collection", 768, False, "cosine", False, False) - @pytest.mark.asyncio async def test_set_up_collection_with_existing_incompatible_collection_async(self): document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True) await document_store._initialize_async_client() - # Mock collection info with named vectors but missing DENSE_VECTORS_NAME mock_collection_info = MagicMock() mock_collection_info.config.params.vectors = {"some_other_vector": MagicMock()} @@ -173,13 +73,10 @@ async def test_set_up_collection_with_existing_incompatible_collection_async(sel with pytest.raises(QdrantStoreError, match="created outside of Haystack"): await document_store._set_up_collection_async("test_collection", 768, False, "cosine", True, False) - @pytest.mark.asyncio async def test_set_up_collection_use_sparse_embeddings_true_without_named_vectors_async(self): """Test that an error is raised when use_sparse_embeddings is True but collection doesn't have named vectors""" document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True) await document_store._initialize_async_client() - - # Mock collection info without named vectors mock_collection_info = MagicMock() mock_collection_info.config.params.vectors = MagicMock(spec=rest.VectorsConfig) @@ -190,12 +87,10 @@ async def test_set_up_collection_use_sparse_embeddings_true_without_named_vector with pytest.raises(QdrantStoreError, match="without sparse embedding vectors"): await document_store._set_up_collection_async("test_collection", 768, False, "cosine", True, False) - @pytest.mark.asyncio async def test_set_up_collection_use_sparse_embeddings_false_with_named_vectors_async(self): """Test that an error is raised when use_sparse_embeddings is False but collection has named vectors""" document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=False) await document_store._initialize_async_client() - # Mock collection info with named vectors mock_collection_info = MagicMock() mock_collection_info.config.params.vectors = {DENSE_VECTORS_NAME: MagicMock()} @@ -206,12 +101,9 @@ async def test_set_up_collection_use_sparse_embeddings_false_with_named_vectors_ with pytest.raises(QdrantStoreError, match="with sparse embedding vectors"): await document_store._set_up_collection_async("test_collection", 768, False, "cosine", False, False) - @pytest.mark.asyncio async def test_set_up_collection_with_distance_mismatch_async(self): document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=False, similarity="cosine") await document_store._initialize_async_client() - - # Mock collection info with different distance mock_collection_info = MagicMock() mock_collection_info.config.params.vectors = MagicMock() mock_collection_info.config.params.vectors.distance = rest.Distance.DOT @@ -224,245 +116,118 @@ async def test_set_up_collection_with_distance_mismatch_async(self): with pytest.raises(ValueError, match="different similarity"): await document_store._set_up_collection_async("test_collection", 768, False, "cosine", False, False) - @pytest.mark.asyncio - async def test_delete_all_documents_async_no_index_recreation(self, document_store): - await document_store._initialize_async_client() - - # write some documents - docs = [Document(id=str(i)) for i in range(5)] - await document_store.write_documents_async(docs) - - # delete all documents without recreating the index - await document_store.delete_all_documents_async(recreate_index=False) - assert await document_store.count_documents_async() == 0 - # ensure the collection still exists by writing documents again - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 5 +@pytest.mark.integration +@pytest.mark.asyncio +class TestQdrantDocumentStoreAsync( + CountDocumentsAsyncTest, + WriteDocumentsAsyncTest, + DeleteDocumentsAsyncTest, + DeleteAllAsyncTest, + DeleteByFilterAsyncTest, + FilterDocumentsAsyncTest, + UpdateByFilterAsyncTest, + CountDocumentsByFilterAsyncTest, + CountUniqueMetadataByFilterAsyncTest, + GetMetadataFieldsInfoAsyncTest, + GetMetadataFieldMinMaxAsyncTest, + GetMetadataFieldUniqueValuesAsyncTest, +): + @pytest_asyncio.fixture + async def document_store(self): + store = QdrantDocumentStore( + ":memory:", + recreate_index=True, + return_embedding=True, + wait_result_from_api=True, + use_sparse_embeddings=False, + progress_bar=False, + ) + yield store - @pytest.mark.asyncio - async def test_delete_all_documents_async_index_recreation(self, document_store): + async def test_sparse_configuration_async(self): + document_store = QdrantDocumentStore( + ":memory:", + recreate_index=True, + use_sparse_embeddings=True, + sparse_idf=True, + ) await document_store._initialize_async_client() - # write some documents - docs = [Document(id=str(i)) for i in range(5)] - await document_store.write_documents_async(docs) - - # get the current document_store config - config_before = await document_store._async_client.get_collection(document_store.index) - - # delete all documents with recreating the index - await document_store.delete_all_documents_async(recreate_index=True) - assert await document_store.count_documents_async() == 0 - - # assure that with the same config - config_after = await document_store._async_client.get_collection(document_store.index) - - assert config_before.config == config_after.config + collection = await document_store._async_client.get_collection("Document") + sparse_config = collection.config.params.sparse_vectors - # ensure the collection still exists by writing documents again - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 5 + assert SPARSE_VECTORS_NAME in sparse_config + assert hasattr(sparse_config[SPARSE_VECTORS_NAME], "modifier") + assert sparse_config[SPARSE_VECTORS_NAME].modifier == rest.Modifier.IDF - @pytest.mark.asyncio - async def test_delete_by_filter_async(self, document_store: QdrantDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023}), - Document(content="Doc 2", meta={"category": "B", "year": 2023}), - Document(content="Doc 3", meta={"category": "A", "year": 2024}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 + async def test_query_hybrid_async(self, generate_sparse_embedding): + document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True, progress_bar=False) - # Delete documents with category="A" - deleted_count = await document_store.delete_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"} - ) - assert deleted_count == 2 - assert await document_store.count_documents_async() == 1 - - # Verify only category B remains - remaining_docs = [] - async for doc in document_store._get_documents_generator_async(): - remaining_docs.append(doc) - assert len(remaining_docs) == 1 - assert remaining_docs[0].meta["category"] == "B" - - # Delete remaining document by year - deleted_count = await document_store.delete_by_filter_async( - filters={"field": "meta.year", "operator": "==", "value": 2023} - ) - assert deleted_count == 1 - assert await document_store.count_documents_async() == 0 + docs = [] + for i in range(20): + docs.append( + Document( + content=f"doc {i}", sparse_embedding=generate_sparse_embedding(), embedding=_random_embeddings(768) + ) + ) - @pytest.mark.asyncio - async def test_delete_by_filter_async_no_matches(self, document_store: QdrantDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - ] await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 2 - - # Try to delete documents with category="C" (no matches) - deleted_count = await document_store.delete_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "C"} - ) - assert deleted_count == 0 - assert await document_store.count_documents_async() == 2 + sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33]) + embedding = [0.1] * 768 - @pytest.mark.asyncio - async def test_delete_by_filter_async_advanced_filters(self, document_store: QdrantDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023, "status": "draft"}), - Document(content="Doc 2", meta={"category": "A", "year": 2024, "status": "published"}), - Document(content="Doc 3", meta={"category": "B", "year": 2023, "status": "draft"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 - - # AND condition (matches only Doc 1) - deleted_count = await document_store.delete_by_filter_async( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "A"}, - {"field": "meta.year", "operator": "==", "value": 2023}, - ], - } - ) - assert deleted_count == 1 - assert await document_store.count_documents_async() == 2 - - # OR condition (matches Doc 2 and Doc 3) - deleted_count = await document_store.delete_by_filter_async( - filters={ - "operator": "OR", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "B"}, - {"field": "meta.status", "operator": "==", "value": "published"}, - ], - } + results: list[Document] = await document_store._query_hybrid_async( + query_sparse_embedding=sparse_embedding, query_embedding=embedding, top_k=10, return_embedding=True ) - assert deleted_count == 2 - assert await document_store.count_documents_async() == 0 + assert len(results) == 10 - @pytest.mark.asyncio - async def test_update_by_filter_async(self, document_store: QdrantDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "draft"}), - Document(content="Doc 2", meta={"category": "B", "status": "draft"}), - Document(content="Doc 3", meta={"category": "A", "status": "draft"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 + for document in results: + assert document.sparse_embedding + assert document.embedding - # Update status for category="A" documents - updated_count = await document_store.update_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"status": "published"} - ) - assert updated_count == 2 + async def test_query_hybrid_with_group_by_async(self, generate_sparse_embedding): + document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True, progress_bar=False) - # Verify the updated documents have the new metadata - published_docs = [] - async for doc in document_store._get_documents_generator_async( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ): - published_docs.append(doc) - assert len(published_docs) == 2 - for doc in published_docs: - assert doc.meta["status"] == "published" - assert doc.meta["category"] == "A" - - # Verify documents with category="B" were not updated - draft_docs = [] - async for doc in document_store._get_documents_generator_async( - filters={"field": "meta.status", "operator": "==", "value": "draft"} - ): - draft_docs.append(doc) - assert len(draft_docs) == 1 - assert draft_docs[0].meta["category"] == "B" + docs = [] + for i in range(20): + docs.append( + Document( + content=f"doc {i}", + sparse_embedding=generate_sparse_embedding(), + embedding=_random_embeddings(768), + meta={"group_field": i // 2}, + ) + ) - @pytest.mark.asyncio - async def test_update_by_filter_async_multiple_fields(self, document_store: QdrantDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023}), - Document(content="Doc 2", meta={"category": "A", "year": 2023}), - Document(content="Doc 3", meta={"category": "B", "year": 2024}), - ] await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 - - # Update multiple fields for category="A" documents - updated_count = await document_store.update_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"}, - meta={"status": "published", "reviewed": True}, - ) - assert updated_count == 2 - # Verify updates - published_docs = [] - async for doc in document_store._get_documents_generator_async( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ): - published_docs.append(doc) - assert len(published_docs) == 2 - for doc in published_docs: - assert doc.meta["status"] == "published" - assert doc.meta["reviewed"] is True - assert doc.meta["category"] == "A" - assert doc.meta["year"] == 2023 # Existing field preserved - - @pytest.mark.asyncio - async def test_update_by_filter_async_no_matches(self, document_store: QdrantDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 2 + sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33]) + embedding = [0.1] * 768 - # Try to update documents with category="C" (no matches) - updated_count = await document_store.update_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "C"}, meta={"status": "published"} + results: list[Document] = await document_store._query_hybrid_async( + query_sparse_embedding=sparse_embedding, + query_embedding=embedding, + top_k=3, + return_embedding=True, + group_by="meta.group_field", + group_size=2, ) - assert updated_count == 0 - assert await document_store.count_documents_async() == 2 + assert len(results) == 6 - @pytest.mark.asyncio - async def test_update_by_filter_async_advanced_filters(self, document_store: QdrantDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023, "status": "draft"}), - Document(content="Doc 2", meta={"category": "A", "year": 2024, "status": "draft"}), - Document(content="Doc 3", meta={"category": "B", "year": 2023, "status": "draft"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 + for document in results: + assert document.sparse_embedding + assert document.embedding - # Update with AND condition - updated_count = await document_store.update_by_filter_async( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "A"}, - {"field": "meta.year", "operator": "==", "value": 2023}, - ], - }, - meta={"status": "published"}, - ) - assert updated_count == 1 + async def test_query_hybrid_fail_without_sparse_embedding_async(self, document_store): + sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33]) + embedding = [0.1] * 768 - # Verify only one document was updated - published_docs = [] - async for doc in document_store._get_documents_generator_async( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ): - published_docs.append(doc) - assert len(published_docs) == 1 - assert published_docs[0].meta["category"] == "A" - assert published_docs[0].meta["year"] == 2023 + with pytest.raises(QdrantStoreError): + await document_store._query_hybrid_async( + query_sparse_embedding=sparse_embedding, + query_embedding=embedding, + ) - @pytest.mark.asyncio async def test_update_by_filter_async_preserves_vectors(self, document_store: QdrantDocumentStore): """Test that update_by_filter_async preserves document embeddings.""" docs = [ @@ -471,13 +236,11 @@ async def test_update_by_filter_async_preserves_vectors(self, document_store: Qd ] await document_store.write_documents_async(docs) - # Update metadata updated_count = await document_store.update_by_filter_async( filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"status": "published"} ) assert updated_count == 1 - # Verify embedding is preserved updated_docs = [] async for doc in document_store._get_documents_generator_async( filters={"field": "meta.status", "operator": "==", "value": "published"} @@ -486,155 +249,3 @@ async def test_update_by_filter_async_preserves_vectors(self, document_store: Qd assert len(updated_docs) == 1 assert updated_docs[0].embedding is not None assert len(updated_docs[0].embedding) == 768 - - @pytest.mark.asyncio - async def test_count_documents_by_filter_async(self, document_store: QdrantDocumentStore): - """Test counting documents with filters (async).""" - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023}), - Document(content="Doc 2", meta={"category": "A", "year": 2024}), - Document(content="Doc 3", meta={"category": "B", "year": 2023}), - Document(content="Doc 4", meta={"category": "B", "year": 2024}), - ] - await document_store.write_documents_async(docs) - - # Test counting all documents - count = await document_store.count_documents_async() - assert count == 4 - - # Test counting with single filter - count = await document_store.count_documents_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"} - ) - assert count == 2 - - # Test counting with multiple filters - count = await document_store.count_documents_by_filter_async( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "B"}, - {"field": "meta.year", "operator": "==", "value": 2023}, - ], - } - ) - assert count == 1 - - @pytest.mark.asyncio - async def test_get_metadata_fields_info_async(self, document_store: QdrantDocumentStore): - """Test getting metadata field information (async).""" - docs = [ - Document(content="Doc 1", meta={"category": "A", "score": 0.9, "tags": ["tag1", "tag2"]}), - Document(content="Doc 2", meta={"category": "B", "score": 0.8, "tags": ["tag2"]}), - ] - await document_store.write_documents_async(docs) - - fields_info = await document_store.get_metadata_fields_info_async() - # Should return empty dict or field info depending on Qdrant collection setup - assert isinstance(fields_info, dict) - - @pytest.mark.asyncio - async def test_get_metadata_field_min_max_async(self, document_store: QdrantDocumentStore): - """Test getting min/max values for a metadata field (async).""" - docs = [ - Document(content="Doc 1", meta={"score": 0.5}), - Document(content="Doc 2", meta={"score": 0.8}), - Document(content="Doc 3", meta={"score": 0.3}), - ] - await document_store.write_documents_async(docs) - - result = await document_store.get_metadata_field_min_max_async("score") - assert result.get("min") == 0.3 - assert result.get("max") == 0.8 - - @pytest.mark.asyncio - async def test_count_unique_metadata_by_filter_async(self, document_store: QdrantDocumentStore): - """Test counting unique metadata field values (async).""" - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - Document(content="Doc 3", meta={"category": "A"}), - Document(content="Doc 4", meta={"category": "C"}), - ] - await document_store.write_documents_async(docs) - - result = await document_store.count_unique_metadata_by_filter_async(filters={}, metadata_fields=["category"]) - assert result == {"category": 3} - - @pytest.mark.asyncio - async def test_count_unique_metadata_by_filter_async_multiple_fields(self, document_store: QdrantDocumentStore): - """Test counting unique values for multiple metadata fields (async).""" - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "active"}), - Document(content="Doc 2", meta={"category": "B", "status": "active"}), - Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), - ] - await document_store.write_documents_async(docs) - - result = await document_store.count_unique_metadata_by_filter_async( - filters={}, metadata_fields=["category", "status"] - ) - assert result == {"category": 2, "status": 2} - - @pytest.mark.asyncio - async def test_count_unique_metadata_by_filter_async_with_filter(self, document_store: QdrantDocumentStore): - """Test counting unique metadata field values with filtering (async).""" - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "active"}), - Document(content="Doc 2", meta={"category": "B", "status": "active"}), - Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), - ] - await document_store.write_documents_async(docs) - - result = await document_store.count_unique_metadata_by_filter_async( - filters={"field": "meta.status", "operator": "==", "value": "active"}, - metadata_fields=["category"], - ) - assert result == {"category": 2} - - @pytest.mark.asyncio - async def test_get_metadata_field_unique_values_async(self, document_store: QdrantDocumentStore): - """Test getting unique metadata field values (async).""" - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - Document(content="Doc 3", meta={"category": "A"}), - Document(content="Doc 4", meta={"category": "C"}), - ] - await document_store.write_documents_async(docs) - - values = await document_store.get_metadata_field_unique_values_async("category") - assert len(values) == 3 - assert set(values) == {"A", "B", "C"} - - @pytest.mark.asyncio - async def test_get_metadata_field_unique_values_async_pagination(self, document_store: QdrantDocumentStore): - """Test getting unique metadata field values with pagination (async).""" - docs = [Document(content=f"Doc {i}", meta={"value": i % 5}) for i in range(10)] - await document_store.write_documents_async(docs) - - # Get first 2 unique values - values_page_1 = await document_store.get_metadata_field_unique_values_async("value", limit=2, offset=0) - assert len(values_page_1) == 2 - - # Get next 2 unique values - values_page_2 = await document_store.get_metadata_field_unique_values_async("value", limit=2, offset=2) - assert len(values_page_2) == 2 - - # Values should not overlap - assert set(values_page_1) != set(values_page_2) - - @pytest.mark.asyncio - async def test_get_metadata_field_unique_values_async_with_filter(self, document_store: QdrantDocumentStore): - """Test getting unique metadata field values with filtering (async).""" - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "active"}), - Document(content="Doc 2", meta={"category": "B", "status": "active"}), - Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), - ] - await document_store.write_documents_async(docs) - - values = await document_store.get_metadata_field_unique_values_async( - "category", filters={"field": "meta.status", "operator": "==", "value": "active"} - ) - assert set(values) == {"A", "B"} From 83cb15198fa99f375b88aa14c82fa7c6ba03a6cb Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Thu, 2 Apr 2026 20:22:06 +0500 Subject: [PATCH 02/10] refactor(pgvector): use async DocumentStore mixin tests Closes #3050 Replaces duplicate async test implementations with mixin classes from haystack.testing.document_store and haystack.testing.document_store_async. - Removes 17 tests now covered by the mixins - Keeps all pgvector-specific tests (blob write, connection recreation, invalid connection, empty meta validation, hnsw index, table management) --- .../tests/test_document_store_async.py | 645 +----------------- 1 file changed, 31 insertions(+), 614 deletions(-) diff --git a/integrations/pgvector/tests/test_document_store_async.py b/integrations/pgvector/tests/test_document_store_async.py index 64b4a49bbb..14a938655f 100644 --- a/integrations/pgvector/tests/test_document_store_async.py +++ b/integrations/pgvector/tests/test_document_store_async.py @@ -7,8 +7,23 @@ import psycopg import pytest from haystack.dataclasses.document import ByteStream, Document -from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError -from haystack.document_stores.types import DuplicatePolicy +from haystack.document_stores.errors import DocumentStoreError +from haystack.testing.document_store import ( + CountDocumentsByFilterAsyncTest, + CountUniqueMetadataByFilterAsyncTest, + UpdateByFilterAsyncTest, +) +from haystack.testing.document_store_async import ( + CountDocumentsAsyncTest, + DeleteAllAsyncTest, + DeleteByFilterAsyncTest, + DeleteDocumentsAsyncTest, + FilterDocumentsAsyncTest, + GetMetadataFieldMinMaxAsyncTest, + GetMetadataFieldsInfoAsyncTest, + GetMetadataFieldUniqueValuesAsyncTest, + WriteDocumentsAsyncTest, +) from haystack.utils import Secret from psycopg.sql import SQL @@ -17,13 +32,20 @@ @pytest.mark.integration @pytest.mark.asyncio -class TestDocumentStoreAsync: - async def test_write_documents(self, document_store: PgvectorDocumentStore): - docs = [Document(id="1")] - assert await document_store.write_documents_async(docs) == 1 - with pytest.raises(DuplicateDocumentError): - await document_store.write_documents_async(docs, DuplicatePolicy.FAIL) - +class TestDocumentStoreAsync( + CountDocumentsAsyncTest, + WriteDocumentsAsyncTest, + DeleteDocumentsAsyncTest, + DeleteAllAsyncTest, + DeleteByFilterAsyncTest, + FilterDocumentsAsyncTest, + UpdateByFilterAsyncTest, + CountDocumentsByFilterAsyncTest, + CountUniqueMetadataByFilterAsyncTest, + GetMetadataFieldsInfoAsyncTest, + GetMetadataFieldMinMaxAsyncTest, + GetMetadataFieldUniqueValuesAsyncTest, +): async def test_write_blob(self, document_store: PgvectorDocumentStore): bytestream = ByteStream(b"test", meta={"meta_key": "meta_value"}, mime_type="mime_type") docs = [Document(id="1", blob=bytestream)] @@ -32,46 +54,6 @@ async def test_write_blob(self, document_store: PgvectorDocumentStore): retrieved_docs = await document_store.filter_documents_async() assert retrieved_docs == docs - async def test_count_documents(self, document_store: PgvectorDocumentStore): - await document_store.write_documents_async( - [ - Document(content="test doc 1"), - Document(content="test doc 2"), - Document(content="test doc 3"), - ] - ) - assert await document_store.count_documents_async() == 3 - - async def test_filter_documents(self, document_store: PgvectorDocumentStore): - filterable_docs = [ - Document( - content="1", - meta={ - "number": -10, - }, - ), - Document( - content="2", - meta={ - "number": 100, - }, - ), - ] - await document_store.write_documents_async(filterable_docs) - result = await document_store.filter_documents_async( - filters={"field": "meta.number", "operator": "==", "value": 100} - ) - - assert result == [d for d in filterable_docs if d.meta.get("number") == 100] - - async def test_delete_documents(self, document_store: PgvectorDocumentStore): - doc = Document(content="test doc") - await document_store.write_documents_async([doc]) - assert await document_store.count_documents_async() == 1 - - await document_store.delete_documents_async([doc.id]) - assert await document_store.count_documents_async() == 0 - async def test_connection_check_and_recreation(self, document_store: PgvectorDocumentStore): await document_store._ensure_db_setup_async() original_connection = document_store._async_connection @@ -94,13 +76,6 @@ async def test_connection_check_and_recreation(self, document_store: PgvectorDoc same_connection = document_store._async_connection assert same_connection is document_store._async_connection - async def test_delete_all_documents_async(self, document_store: PgvectorDocumentStore) -> None: - document_store.write_documents([Document(id=str(i)) for i in range(10)]) - await document_store.delete_all_documents_async() - assert document_store.count_documents() == 0 - document_store.write_documents([Document(id="1")]) - assert document_store.count_documents() == 1 - async def test_invalid_connection_string(self, monkeypatch): monkeypatch.setenv("PG_CONN_STR", "invalid_connection_string") document_store = PgvectorDocumentStore() @@ -108,215 +83,6 @@ async def test_invalid_connection_string(self, monkeypatch): await document_store._ensure_db_setup_async() assert "Failed to connect to PostgreSQL database" in str(e) - async def test_delete_by_filter_async(self, document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023}), - Document(content="Doc 2", meta={"category": "B", "year": 2023}), - Document(content="Doc 3", meta={"category": "A", "year": 2024}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 - - deleted_count = await document_store.delete_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"} - ) - assert deleted_count == 2 - assert await document_store.count_documents_async() == 1 - - remaining_docs = await document_store.filter_documents_async() - assert len(remaining_docs) == 1 - assert remaining_docs[0].meta["category"] == "B" - - deleted_count = await document_store.delete_by_filter_async( - filters={"field": "meta.year", "operator": "==", "value": 2023} - ) - assert deleted_count == 1 - assert await document_store.count_documents_async() == 0 - - async def test_delete_by_filter_async_no_matches(self, document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 2 - - deleted_count = await document_store.delete_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "C"} - ) - assert deleted_count == 0 - assert await document_store.count_documents_async() == 2 - - async def test_delete_by_filter_async_advanced_filters(self, document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023, "status": "draft"}), - Document(content="Doc 2", meta={"category": "A", "year": 2024, "status": "published"}), - Document(content="Doc 3", meta={"category": "B", "year": 2023, "status": "draft"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 - - # AND condition - deleted_count = await document_store.delete_by_filter_async( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "A"}, - {"field": "meta.year", "operator": "==", "value": 2023}, - ], - } - ) - assert deleted_count == 1 - assert await document_store.count_documents_async() == 2 - - # OR condition - deleted_count = await document_store.delete_by_filter_async( - filters={ - "operator": "OR", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "B"}, - {"field": "meta.status", "operator": "==", "value": "published"}, - ], - } - ) - assert deleted_count == 2 - assert await document_store.count_documents_async() == 0 - - async def test_update_by_filter_async(self, document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "draft"}), - Document(content="Doc 2", meta={"category": "B", "status": "draft"}), - Document(content="Doc 3", meta={"category": "A", "status": "draft"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 - - # Update status for category="A" documents - updated_count = await document_store.update_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"status": "published"} - ) - assert updated_count == 2 - - # Verify the updates - published_docs = await document_store.filter_documents_async( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ) - assert len(published_docs) == 2 - for doc in published_docs: - assert doc.meta["category"] == "A" - assert doc.meta["status"] == "published" - - # Verify category B still has draft status - draft_docs = await document_store.filter_documents_async( - filters={"field": "meta.status", "operator": "==", "value": "draft"} - ) - assert len(draft_docs) == 1 - assert draft_docs[0].meta["category"] == "B" - - async def test_update_by_filter_async_multiple_fields(self, document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023}), - Document(content="Doc 2", meta={"category": "A", "year": 2023}), - Document(content="Doc 3", meta={"category": "B", "year": 2024}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 - - # update multiple fields for category="A" documents - updated_count = await document_store.update_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"}, - meta={"status": "published", "priority": "high", "reviewed": True}, - ) - assert updated_count == 2 - - # verify - published_docs = await document_store.filter_documents_async( - filters={"field": "meta.category", "operator": "==", "value": "A"} - ) - assert len(published_docs) == 2 - for doc in published_docs: - assert doc.meta["status"] == "published" - assert doc.meta["priority"] == "high" - assert doc.meta["reviewed"] is True - assert doc.meta["year"] == 2023 # Original field should still be present - - # verify category B was not updated - b_docs = await document_store.filter_documents_async( - filters={"field": "meta.category", "operator": "==", "value": "B"} - ) - assert len(b_docs) == 1 - assert "status" not in b_docs[0].meta - assert "priority" not in b_docs[0].meta - - async def test_update_by_filter_async_no_matches(self, document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 2 - - # update documents with category="C" (no matches) - updated_count = await document_store.update_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "C"}, meta={"status": "published"} - ) - assert updated_count == 0 - assert await document_store.count_documents_async() == 2 - - # verify no documents were updated - published_docs = await document_store.filter_documents_async( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ) - assert len(published_docs) == 0 - - async def test_update_by_filter_async_advanced_filters(self, document_store: PgvectorDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023, "status": "draft"}), - Document(content="Doc 2", meta={"category": "A", "year": 2024, "status": "draft"}), - Document(content="Doc 3", meta={"category": "B", "year": 2023, "status": "draft"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 - - # AND condition - updated_count = await document_store.update_by_filter_async( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "A"}, - {"field": "meta.year", "operator": "==", "value": 2023}, - ], - }, - meta={"status": "published"}, - ) - assert updated_count == 1 - - # verify - published_docs = await document_store.filter_documents_async( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ) - assert len(published_docs) == 1 - assert published_docs[0].meta["category"] == "A" - assert published_docs[0].meta["year"] == 2023 - - # OR condition - updated_count = await document_store.update_by_filter_async( - filters={ - "operator": "OR", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "B"}, - {"field": "meta.year", "operator": "==", "value": 2024}, - ], - }, - meta={"featured": True}, - ) - assert updated_count == 2 - - featured_docs = await document_store.filter_documents_async( - filters={"field": "meta.featured", "operator": "==", "value": True} - ) - assert len(featured_docs) == 2 - async def test_update_by_filter_async_empty_meta_raises_error(self, document_store: PgvectorDocumentStore): docs = [Document(content="Doc 1", meta={"category": "A"})] await document_store.write_documents_async(docs) @@ -448,352 +214,3 @@ async def test_delete_table_async_first_call(document_store): without triggering errors due to an uninitialized state. """ await document_store.delete_table_async() # if throw error, test fails - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_count_documents_by_filter_async(document_store: PgvectorDocumentStore): - filterable_docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "active"}), - Document(content="Doc 2", meta={"category": "B", "status": "active"}), - Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), - Document(content="Doc 4", meta={"category": "A", "status": "active"}), - ] - await document_store.write_documents_async(filterable_docs) - assert await document_store.count_documents_async() == 4 - - count_a = await document_store.count_documents_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"} - ) - assert count_a == 3 - - count_active = await document_store.count_documents_by_filter_async( - filters={"field": "meta.status", "operator": "==", "value": "active"} - ) - assert count_active == 3 - - count_a_active = await document_store.count_documents_by_filter_async( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "A"}, - {"field": "meta.status", "operator": "==", "value": "active"}, - ], - } - ) - assert count_a_active == 2 - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_count_unique_metadata_by_filter_async(document_store: PgvectorDocumentStore): - filterable_docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), - Document(content="Doc 2", meta={"category": "B", "status": "active", "priority": 2}), - Document(content="Doc 3", meta={"category": "A", "status": "inactive", "priority": 1}), - Document(content="Doc 4", meta={"category": "A", "status": "active", "priority": 3}), - Document(content="Doc 5", meta={"category": "C", "status": "active", "priority": 2}), - ] - await document_store.write_documents_async(filterable_docs) - assert await document_store.count_documents_async() == 5 - - # count distinct values for all documents - distinct_counts = await document_store.count_unique_metadata_by_filter_async( - filters={}, metadata_fields=["category", "status", "priority"] - ) - assert distinct_counts["category"] == 3 # A, B, C - assert distinct_counts["status"] == 2 # active, inactive - assert distinct_counts["priority"] == 3 # 1, 2, 3 - - # count distinct values for documents with category="A" - distinct_counts_a = await document_store.count_unique_metadata_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"}, - metadata_fields=["category", "status", "priority"], - ) - assert distinct_counts_a["category"] == 1 # Only A - assert distinct_counts_a["status"] == 2 # active, inactive - assert distinct_counts_a["priority"] == 2 # 1, 3 - - # count distinct values for documents with status="active" - distinct_counts_active = await document_store.count_unique_metadata_by_filter_async( - filters={"field": "meta.status", "operator": "==", "value": "active"}, - metadata_fields=["category", "status", "priority"], - ) - assert distinct_counts_active["category"] == 3 # A, B, C - assert distinct_counts_active["status"] == 1 # Only active - assert distinct_counts_active["priority"] == 3 # 1, 2, 3 - - # count distinct values with complex filter (category="A" AND status="active") - distinct_counts_a_active = await document_store.count_unique_metadata_by_filter_async( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "A"}, - {"field": "meta.status", "operator": "==", "value": "active"}, - ], - }, - metadata_fields=["category", "status", "priority"], - ) - assert distinct_counts_a_active["category"] == 1 # Only A - assert distinct_counts_a_active["status"] == 1 # Only active - assert distinct_counts_a_active["priority"] == 2 # 1, 3 - - # Test with only a subset of fields - distinct_counts_subset = await document_store.count_unique_metadata_by_filter_async( - filters={}, metadata_fields=["category", "status"] - ) - assert distinct_counts_subset["category"] == 3 - assert distinct_counts_subset["status"] == 2 - assert "priority" not in distinct_counts_subset - - # Test field name normalization (with "meta." prefix) - distinct_counts_normalized = await document_store.count_unique_metadata_by_filter_async( - filters={}, metadata_fields=["meta.category", "status", "meta.priority"] - ) - assert distinct_counts_normalized["category"] == 3 - assert distinct_counts_normalized["status"] == 2 - assert distinct_counts_normalized["priority"] == 3 - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_get_metadata_fields_info_async(document_store: PgvectorDocumentStore): - filterable_docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), - Document(content="Doc 2", meta={"category": "B", "status": "inactive"}), - ] - await document_store.write_documents_async(filterable_docs) - - fields_info = await document_store.get_metadata_fields_info_async() - - # Verify that fields_info contains expected fields - assert "content" in fields_info - assert "category" in fields_info - assert "status" in fields_info - assert "priority" in fields_info - - assert fields_info["content"]["type"] == "text" - assert fields_info["category"]["type"] == "text" - assert fields_info["status"]["type"] == "text" - assert fields_info["priority"]["type"] == "integer" - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_get_metadata_field_min_max_async(document_store: PgvectorDocumentStore): - # Test with integer values - docs = [ - Document(content="Doc 1", meta={"priority": 1, "age": 10}), - Document(content="Doc 2", meta={"priority": 5, "age": 20}), - Document(content="Doc 3", meta={"priority": 3, "age": 15}), - Document(content="Doc 4", meta={"priority": 10, "age": 5}), - Document(content="Doc 6", meta={"rating": 10.5}), - Document(content="Doc 7", meta={"rating": 20.3}), - Document(content="Doc 8", meta={"rating": 15.7}), - Document(content="Doc 9", meta={"rating": 5.2}), - ] - await document_store.write_documents_async(docs) - - # Test with "meta." prefix for integer field - min_max_priority = await document_store.get_metadata_field_min_max_async("meta.priority") - assert min_max_priority["min"] == 1 - assert min_max_priority["max"] == 10 - - # Test with "meta." prefix for another integer field - min_max_age = await document_store.get_metadata_field_min_max_async("meta.age") - assert min_max_age["min"] == 5 - assert min_max_age["max"] == 20 - - # Test with single value - single_doc = [Document(content="Doc 5", meta={"single_value": 42})] - await document_store.write_documents_async(single_doc) - min_max_single = await document_store.get_metadata_field_min_max_async("meta.single_value") - assert min_max_single["min"] == 42 - assert min_max_single["max"] == 42 - - # Test with float values - min_max_rating = await document_store.get_metadata_field_min_max_async("meta.rating") - assert min_max_rating["min"] == pytest.approx(5.2) - assert min_max_rating["max"] == pytest.approx(20.3) - - # Test with text/string values - lexicographic comparison - text_docs = [ - Document(content="Doc 1", meta={"category": "Zebra", "status": "active"}), - Document(content="Doc 2", meta={"category": "Apple", "status": "pending"}), - Document(content="Doc 3", meta={"category": "Banana", "status": "inactive"}), - Document(content="Doc 4", meta={"category": "apple", "status": "active"}), - ] - await document_store.write_documents_async(text_docs) - - # Test lexicographic min/max for text fields (case-sensitive) - min_max_category = await document_store.get_metadata_field_min_max_async("meta.category") - assert min_max_category["min"] == "Apple" # 'A' comes before 'B' and 'Z' and 'a' - assert min_max_category["max"] == "apple" # 'a' comes after 'A', 'B', 'Z' in ASCII - - min_max_status = await document_store.get_metadata_field_min_max_async("meta.status") - assert min_max_status["min"] == "active" # 'a' comes before 'i' and 'p' - assert min_max_status["max"] == "pending" # 'p' comes after 'a' and 'i' - - # Test with empty strings - empty_string_docs = [ - Document(content="Doc 1", meta={"tag": ""}), - Document(content="Doc 2", meta={"tag": "A"}), - Document(content="Doc 3", meta={"tag": "B"}), - ] - await document_store.write_documents_async(empty_string_docs) - min_max_tag = await document_store.get_metadata_field_min_max_async("meta.tag") - assert min_max_tag["min"] == "" # Empty string is typically minimum - assert min_max_tag["max"] == "B" # 'B' is maximum - - # Test with special characters - special_char_docs = [ - Document(content="Doc 1", meta={"code": "!@#"}), - Document(content="Doc 2", meta={"code": "$%^"}), - Document(content="Doc 3", meta={"code": "&*()"}), - ] - await document_store.write_documents_async(special_char_docs) - min_max_code = await document_store.get_metadata_field_min_max_async("meta.code") - # Special characters have specific ASCII ordering - assert min_max_code["min"] in ["!@#", "$%^", "&*()"] - assert min_max_code["max"] in ["!@#", "$%^", "&*()"] - - # Test with Unicode characters - unicode_docs = [ - Document(content="Doc 1", meta={"name": "Ángel"}), - Document(content="Doc 2", meta={"name": "Zebra"}), - Document(content="Doc 3", meta={"name": "Alpha"}), - ] - await document_store.write_documents_async(unicode_docs) - min_max_name = await document_store.get_metadata_field_min_max_async("meta.name") - # With COLLATE "C", comparison is byte-order based - # "Alpha" should be minimum (A comes first in ASCII) - # "Ángel" or "Zebra" will be maximum depending on byte encoding - assert min_max_name["min"] == "Alpha" # 'A' comes first in ASCII - assert min_max_name["max"] in ["Ángel", "Zebra"] # Depends on UTF-8 byte encoding - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_get_metadata_field_unique_values_async(document_store: PgvectorDocumentStore): - # Test with string values - docs = [ - Document(content="Python programming", meta={"category": "A", "language": "Python"}), - Document(content="Java programming", meta={"category": "B", "language": "Java"}), - Document(content="Python scripting", meta={"category": "A", "language": "Python"}), - Document(content="JavaScript development", meta={"category": "C", "language": "JavaScript"}), - Document(content="Python data science", meta={"category": "A", "language": "Python"}), - Document(content="Java backend", meta={"category": "B", "language": "Java"}), - ] - await document_store.write_documents_async(docs) - - # Test getting all unique values without search term - unique_values, total_count = await document_store.get_metadata_field_unique_values_async( - "meta.category", None, 0, 10 - ) - assert set(unique_values) == {"A", "B", "C"} - assert total_count == 3 - - # Test with "meta." prefix - unique_languages, total_languages = await document_store.get_metadata_field_unique_values_async( - "meta.language", None, 0, 10 - ) - assert set(unique_languages) == {"Python", "Java", "JavaScript"} - assert total_languages == 3 - - # Test pagination - first page - unique_values_page1, total_count_page1 = await document_store.get_metadata_field_unique_values_async( - "meta.category", None, 0, 2 - ) - assert len(unique_values_page1) == 2 - assert all(val in ["A", "B", "C"] for val in unique_values_page1) - assert total_count_page1 == 3 - - # Test pagination - second page - unique_values_page2, total_count_page2 = await document_store.get_metadata_field_unique_values_async( - "meta.category", None, 2, 2 - ) - assert len(unique_values_page2) == 1 - assert unique_values_page2[0] in ["A", "B", "C"] - assert total_count_page2 == 3 - - # Test pagination - verify pages don't overlap - assert not set(unique_values_page1).intersection(set(unique_values_page2)) - - # Test pagination - verify all values are covered - all_values = set(unique_values_page1) | set(unique_values_page2) - assert all_values == {"A", "B", "C"} - - # Test pagination - size larger than total count - unique_values_large, total_large = await document_store.get_metadata_field_unique_values_async( - "meta.category", None, 0, 100 - ) - assert len(unique_values_large) == 3 - assert set(unique_values_large) == {"A", "B", "C"} - assert total_large == 3 - - # Test pagination - from_ beyond total count (should return empty) - unique_values_beyond, total_beyond = await document_store.get_metadata_field_unique_values_async( - "meta.category", None, 10, 10 - ) - assert len(unique_values_beyond) == 0 - assert total_beyond == 3 - - # Test pagination - single item per page - unique_values_single1, _ = await document_store.get_metadata_field_unique_values_async("meta.category", None, 0, 1) - unique_values_single2, _ = await document_store.get_metadata_field_unique_values_async("meta.category", None, 1, 1) - unique_values_single3, _ = await document_store.get_metadata_field_unique_values_async("meta.category", None, 2, 1) - assert len(unique_values_single1) == 1 - assert len(unique_values_single2) == 1 - assert len(unique_values_single3) == 1 - # All three pages should be different - assert len(set(unique_values_single1 + unique_values_single2 + unique_values_single3)) == 3 - - # Test with search term - filter by content matching "Python" - unique_values_filtered, total_filtered = await document_store.get_metadata_field_unique_values_async( - "meta.category", "Python", 0, 10 - ) - assert set(unique_values_filtered) == {"A"} # Only category A has documents with "Python" in content - assert total_filtered == 1 - - # Test with search term - filter by content matching "Java" - unique_values_java, total_java = await document_store.get_metadata_field_unique_values_async( - "meta.category", "Java", 0, 10 - ) - assert set(unique_values_java) == {"B"} # Only category B has documents with "Java" in content - assert total_java == 1 - - # Test pagination with search term - unique_values_search_page1, total_search = await document_store.get_metadata_field_unique_values_async( - "meta.language", "Python", 0, 1 - ) - assert len(unique_values_search_page1) == 1 - assert unique_values_search_page1[0] == "Python" - assert total_search == 1 - - # Test pagination with search term - beyond results - unique_values_search_empty, total_search_empty = await document_store.get_metadata_field_unique_values_async( - "meta.language", "Python", 10, 10 - ) - assert len(unique_values_search_empty) == 0 - assert total_search_empty == 1 - - # Test with integer values - int_docs = [ - Document(content="Doc 1", meta={"priority": 1}), - Document(content="Doc 2", meta={"priority": 2}), - Document(content="Doc 3", meta={"priority": 1}), - Document(content="Doc 4", meta={"priority": 3}), - ] - await document_store.write_documents_async(int_docs) - unique_priorities, total_priorities = await document_store.get_metadata_field_unique_values_async( - "meta.priority", None, 0, 10 - ) - assert set(unique_priorities) == {"1", "2", "3"} - assert total_priorities == 3 - - # Test with search term on integer field - unique_priorities_filtered, total_priorities_filtered = await document_store.get_metadata_field_unique_values_async( - "meta.priority", "Doc 1", 0, 10 - ) - assert set(unique_priorities_filtered) == {"1"} - assert total_priorities_filtered == 1 From 3ac764bb09ad9cb8cf7ee59d4cd181b283315ec5 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Fri, 10 Apr 2026 12:57:49 +0500 Subject: [PATCH 03/10] revert: restore qdrant async tests unchanged This PR only covers pgvector. The qdrant async test refactor was accidentally included - reverting to keep scope correct. --- .../qdrant/tests/test_document_store_async.py | 430 ++++++++++++++++-- 1 file changed, 397 insertions(+), 33 deletions(-) diff --git a/integrations/qdrant/tests/test_document_store_async.py b/integrations/qdrant/tests/test_document_store_async.py index 4e9f3ed9b8..ce5f7a5c87 100644 --- a/integrations/qdrant/tests/test_document_store_async.py +++ b/integrations/qdrant/tests/test_document_store_async.py @@ -1,26 +1,13 @@ from unittest.mock import MagicMock, patch import pytest -import pytest_asyncio from haystack import Document from haystack.dataclasses import SparseEmbedding +from haystack.document_stores.errors import DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy from haystack.testing.document_store import ( - CountDocumentsByFilterAsyncTest, - CountUniqueMetadataByFilterAsyncTest, - UpdateByFilterAsyncTest, _random_embeddings, ) -from haystack.testing.document_store_async import ( - CountDocumentsAsyncTest, - DeleteAllAsyncTest, - DeleteByFilterAsyncTest, - DeleteDocumentsAsyncTest, - FilterDocumentsAsyncTest, - GetMetadataFieldMinMaxAsyncTest, - GetMetadataFieldsInfoAsyncTest, - GetMetadataFieldUniqueValuesAsyncTest, - WriteDocumentsAsyncTest, -) from qdrant_client.http import models as rest from haystack_integrations.document_stores.qdrant.document_store import ( @@ -48,6 +35,7 @@ async def test_query_hybrid_search_batch_failure_async(self): async def test_set_up_collection_with_dimension_mismatch_async(self): document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=False, similarity="cosine") await document_store._initialize_async_client() + # Mock collection info with different vector size mock_collection_info = MagicMock() mock_collection_info.config.params.vectors = MagicMock() mock_collection_info.config.params.vectors.distance = rest.Distance.COSINE @@ -63,6 +51,7 @@ async def test_set_up_collection_with_dimension_mismatch_async(self): async def test_set_up_collection_with_existing_incompatible_collection_async(self): document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True) await document_store._initialize_async_client() + # Mock collection info with named vectors but missing DENSE_VECTORS_NAME mock_collection_info = MagicMock() mock_collection_info.config.params.vectors = {"some_other_vector": MagicMock()} @@ -77,6 +66,8 @@ async def test_set_up_collection_use_sparse_embeddings_true_without_named_vector """Test that an error is raised when use_sparse_embeddings is True but collection doesn't have named vectors""" document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=True) await document_store._initialize_async_client() + + # Mock collection info without named vectors mock_collection_info = MagicMock() mock_collection_info.config.params.vectors = MagicMock(spec=rest.VectorsConfig) @@ -91,6 +82,7 @@ async def test_set_up_collection_use_sparse_embeddings_false_with_named_vectors_ """Test that an error is raised when use_sparse_embeddings is False but collection has named vectors""" document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=False) await document_store._initialize_async_client() + # Mock collection info with named vectors mock_collection_info = MagicMock() mock_collection_info.config.params.vectors = {DENSE_VECTORS_NAME: MagicMock()} @@ -104,6 +96,8 @@ async def test_set_up_collection_use_sparse_embeddings_false_with_named_vectors_ async def test_set_up_collection_with_distance_mismatch_async(self): document_store = QdrantDocumentStore(location=":memory:", use_sparse_embeddings=False, similarity="cosine") await document_store._initialize_async_client() + + # Mock collection info with different distance mock_collection_info = MagicMock() mock_collection_info.config.params.vectors = MagicMock() mock_collection_info.config.params.vectors.distance = rest.Distance.DOT @@ -119,23 +113,10 @@ async def test_set_up_collection_with_distance_mismatch_async(self): @pytest.mark.integration @pytest.mark.asyncio -class TestQdrantDocumentStoreAsync( - CountDocumentsAsyncTest, - WriteDocumentsAsyncTest, - DeleteDocumentsAsyncTest, - DeleteAllAsyncTest, - DeleteByFilterAsyncTest, - FilterDocumentsAsyncTest, - UpdateByFilterAsyncTest, - CountDocumentsByFilterAsyncTest, - CountUniqueMetadataByFilterAsyncTest, - GetMetadataFieldsInfoAsyncTest, - GetMetadataFieldMinMaxAsyncTest, - GetMetadataFieldUniqueValuesAsyncTest, -): - @pytest_asyncio.fixture - async def document_store(self): - store = QdrantDocumentStore( +class TestQdrantDocumentStoreAsync: + @pytest.fixture + def document_store(self) -> QdrantDocumentStore: + return QdrantDocumentStore( ":memory:", recreate_index=True, return_embedding=True, @@ -143,7 +124,14 @@ async def document_store(self): use_sparse_embeddings=False, progress_bar=False, ) - yield store + + async def test_write_documents_async(self, document_store: QdrantDocumentStore): + docs = [Document(id="1")] + + result = await document_store.write_documents_async(docs) + assert result == 1 + with pytest.raises(DuplicateDocumentError): + await document_store.write_documents_async(docs, DuplicatePolicy.FAIL) async def test_sparse_configuration_async(self): document_store = QdrantDocumentStore( @@ -158,6 +146,8 @@ async def test_sparse_configuration_async(self): sparse_config = collection.config.params.sparse_vectors assert SPARSE_VECTORS_NAME in sparse_config + + # check that the `sparse_idf` parameter takes effect assert hasattr(sparse_config[SPARSE_VECTORS_NAME], "modifier") assert sparse_config[SPARSE_VECTORS_NAME].modifier == rest.Modifier.IDF @@ -228,6 +218,235 @@ async def test_query_hybrid_fail_without_sparse_embedding_async(self, document_s query_embedding=embedding, ) + async def test_delete_all_documents_async_no_index_recreation(self, document_store): + await document_store._initialize_async_client() + + # write some documents + docs = [Document(id=str(i)) for i in range(5)] + await document_store.write_documents_async(docs) + + # delete all documents without recreating the index + await document_store.delete_all_documents_async(recreate_index=False) + assert await document_store.count_documents_async() == 0 + + # ensure the collection still exists by writing documents again + await document_store.write_documents_async(docs) + assert await document_store.count_documents_async() == 5 + + async def test_delete_all_documents_async_index_recreation(self, document_store): + await document_store._initialize_async_client() + + # write some documents + docs = [Document(id=str(i)) for i in range(5)] + await document_store.write_documents_async(docs) + + # get the current document_store config + config_before = await document_store._async_client.get_collection(document_store.index) + + # delete all documents with recreating the index + await document_store.delete_all_documents_async(recreate_index=True) + assert await document_store.count_documents_async() == 0 + + # assure that with the same config + config_after = await document_store._async_client.get_collection(document_store.index) + + assert config_before.config == config_after.config + + # ensure the collection still exists by writing documents again + await document_store.write_documents_async(docs) + assert await document_store.count_documents_async() == 5 + + async def test_delete_by_filter_async(self, document_store: QdrantDocumentStore): + docs = [ + Document(content="Doc 1", meta={"category": "A", "year": 2023}), + Document(content="Doc 2", meta={"category": "B", "year": 2023}), + Document(content="Doc 3", meta={"category": "A", "year": 2024}), + ] + await document_store.write_documents_async(docs) + assert await document_store.count_documents_async() == 3 + + # Delete documents with category="A" + deleted_count = await document_store.delete_by_filter_async( + filters={"field": "meta.category", "operator": "==", "value": "A"} + ) + assert deleted_count == 2 + assert await document_store.count_documents_async() == 1 + + # Verify only category B remains + remaining_docs = [] + async for doc in document_store._get_documents_generator_async(): + remaining_docs.append(doc) + assert len(remaining_docs) == 1 + assert remaining_docs[0].meta["category"] == "B" + + # Delete remaining document by year + deleted_count = await document_store.delete_by_filter_async( + filters={"field": "meta.year", "operator": "==", "value": 2023} + ) + assert deleted_count == 1 + assert await document_store.count_documents_async() == 0 + + async def test_delete_by_filter_async_no_matches(self, document_store: QdrantDocumentStore): + docs = [ + Document(content="Doc 1", meta={"category": "A"}), + Document(content="Doc 2", meta={"category": "B"}), + ] + await document_store.write_documents_async(docs) + assert await document_store.count_documents_async() == 2 + + # Try to delete documents with category="C" (no matches) + deleted_count = await document_store.delete_by_filter_async( + filters={"field": "meta.category", "operator": "==", "value": "C"} + ) + assert deleted_count == 0 + assert await document_store.count_documents_async() == 2 + + async def test_delete_by_filter_async_advanced_filters(self, document_store: QdrantDocumentStore): + docs = [ + Document(content="Doc 1", meta={"category": "A", "year": 2023, "status": "draft"}), + Document(content="Doc 2", meta={"category": "A", "year": 2024, "status": "published"}), + Document(content="Doc 3", meta={"category": "B", "year": 2023, "status": "draft"}), + ] + await document_store.write_documents_async(docs) + assert await document_store.count_documents_async() == 3 + + # AND condition (matches only Doc 1) + deleted_count = await document_store.delete_by_filter_async( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "A"}, + {"field": "meta.year", "operator": "==", "value": 2023}, + ], + } + ) + assert deleted_count == 1 + assert await document_store.count_documents_async() == 2 + + # OR condition (matches Doc 2 and Doc 3) + deleted_count = await document_store.delete_by_filter_async( + filters={ + "operator": "OR", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "B"}, + {"field": "meta.status", "operator": "==", "value": "published"}, + ], + } + ) + assert deleted_count == 2 + assert await document_store.count_documents_async() == 0 + + async def test_update_by_filter_async(self, document_store: QdrantDocumentStore): + docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "draft"}), + Document(content="Doc 2", meta={"category": "B", "status": "draft"}), + Document(content="Doc 3", meta={"category": "A", "status": "draft"}), + ] + await document_store.write_documents_async(docs) + assert await document_store.count_documents_async() == 3 + + # Update status for category="A" documents + updated_count = await document_store.update_by_filter_async( + filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"status": "published"} + ) + assert updated_count == 2 + + # Verify the updated documents have the new metadata + published_docs = [] + async for doc in document_store._get_documents_generator_async( + filters={"field": "meta.status", "operator": "==", "value": "published"} + ): + published_docs.append(doc) + assert len(published_docs) == 2 + for doc in published_docs: + assert doc.meta["status"] == "published" + assert doc.meta["category"] == "A" + + # Verify documents with category="B" were not updated + draft_docs = [] + async for doc in document_store._get_documents_generator_async( + filters={"field": "meta.status", "operator": "==", "value": "draft"} + ): + draft_docs.append(doc) + assert len(draft_docs) == 1 + assert draft_docs[0].meta["category"] == "B" + + async def test_update_by_filter_async_multiple_fields(self, document_store: QdrantDocumentStore): + docs = [ + Document(content="Doc 1", meta={"category": "A", "year": 2023}), + Document(content="Doc 2", meta={"category": "A", "year": 2023}), + Document(content="Doc 3", meta={"category": "B", "year": 2024}), + ] + await document_store.write_documents_async(docs) + assert await document_store.count_documents_async() == 3 + + # Update multiple fields for category="A" documents + updated_count = await document_store.update_by_filter_async( + filters={"field": "meta.category", "operator": "==", "value": "A"}, + meta={"status": "published", "reviewed": True}, + ) + assert updated_count == 2 + + # Verify updates + published_docs = [] + async for doc in document_store._get_documents_generator_async( + filters={"field": "meta.status", "operator": "==", "value": "published"} + ): + published_docs.append(doc) + assert len(published_docs) == 2 + for doc in published_docs: + assert doc.meta["status"] == "published" + assert doc.meta["reviewed"] is True + assert doc.meta["category"] == "A" + assert doc.meta["year"] == 2023 # Existing field preserved + + async def test_update_by_filter_async_no_matches(self, document_store: QdrantDocumentStore): + docs = [ + Document(content="Doc 1", meta={"category": "A"}), + Document(content="Doc 2", meta={"category": "B"}), + ] + await document_store.write_documents_async(docs) + assert await document_store.count_documents_async() == 2 + + # Try to update documents with category="C" (no matches) + updated_count = await document_store.update_by_filter_async( + filters={"field": "meta.category", "operator": "==", "value": "C"}, meta={"status": "published"} + ) + assert updated_count == 0 + assert await document_store.count_documents_async() == 2 + + async def test_update_by_filter_async_advanced_filters(self, document_store: QdrantDocumentStore): + docs = [ + Document(content="Doc 1", meta={"category": "A", "year": 2023, "status": "draft"}), + Document(content="Doc 2", meta={"category": "A", "year": 2024, "status": "draft"}), + Document(content="Doc 3", meta={"category": "B", "year": 2023, "status": "draft"}), + ] + await document_store.write_documents_async(docs) + assert await document_store.count_documents_async() == 3 + + # Update with AND condition + updated_count = await document_store.update_by_filter_async( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "A"}, + {"field": "meta.year", "operator": "==", "value": 2023}, + ], + }, + meta={"status": "published"}, + ) + assert updated_count == 1 + + # Verify only one document was updated + published_docs = [] + async for doc in document_store._get_documents_generator_async( + filters={"field": "meta.status", "operator": "==", "value": "published"} + ): + published_docs.append(doc) + assert len(published_docs) == 1 + assert published_docs[0].meta["category"] == "A" + assert published_docs[0].meta["year"] == 2023 + async def test_update_by_filter_async_preserves_vectors(self, document_store: QdrantDocumentStore): """Test that update_by_filter_async preserves document embeddings.""" docs = [ @@ -236,11 +455,13 @@ async def test_update_by_filter_async_preserves_vectors(self, document_store: Qd ] await document_store.write_documents_async(docs) + # Update metadata updated_count = await document_store.update_by_filter_async( filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"status": "published"} ) assert updated_count == 1 + # Verify embedding is preserved updated_docs = [] async for doc in document_store._get_documents_generator_async( filters={"field": "meta.status", "operator": "==", "value": "published"} @@ -249,3 +470,146 @@ async def test_update_by_filter_async_preserves_vectors(self, document_store: Qd assert len(updated_docs) == 1 assert updated_docs[0].embedding is not None assert len(updated_docs[0].embedding) == 768 + + async def test_count_documents_by_filter_async(self, document_store: QdrantDocumentStore): + """Test counting documents with filters (async).""" + docs = [ + Document(content="Doc 1", meta={"category": "A", "year": 2023}), + Document(content="Doc 2", meta={"category": "A", "year": 2024}), + Document(content="Doc 3", meta={"category": "B", "year": 2023}), + Document(content="Doc 4", meta={"category": "B", "year": 2024}), + ] + await document_store.write_documents_async(docs) + + # Test counting all documents + count = await document_store.count_documents_async() + assert count == 4 + + # Test counting with single filter + count = await document_store.count_documents_by_filter_async( + filters={"field": "meta.category", "operator": "==", "value": "A"} + ) + assert count == 2 + + # Test counting with multiple filters + count = await document_store.count_documents_by_filter_async( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "B"}, + {"field": "meta.year", "operator": "==", "value": 2023}, + ], + } + ) + assert count == 1 + + async def test_get_metadata_fields_info_async(self, document_store: QdrantDocumentStore): + """Test getting metadata field information (async).""" + docs = [ + Document(content="Doc 1", meta={"category": "A", "score": 0.9, "tags": ["tag1", "tag2"]}), + Document(content="Doc 2", meta={"category": "B", "score": 0.8, "tags": ["tag2"]}), + ] + await document_store.write_documents_async(docs) + + fields_info = await document_store.get_metadata_fields_info_async() + # Should return empty dict or field info depending on Qdrant collection setup + assert isinstance(fields_info, dict) + + async def test_get_metadata_field_min_max_async(self, document_store: QdrantDocumentStore): + """Test getting min/max values for a metadata field (async).""" + docs = [ + Document(content="Doc 1", meta={"score": 0.5}), + Document(content="Doc 2", meta={"score": 0.8}), + Document(content="Doc 3", meta={"score": 0.3}), + ] + await document_store.write_documents_async(docs) + + result = await document_store.get_metadata_field_min_max_async("score") + assert result.get("min") == 0.3 + assert result.get("max") == 0.8 + + async def test_count_unique_metadata_by_filter_async(self, document_store: QdrantDocumentStore): + """Test counting unique metadata field values (async).""" + docs = [ + Document(content="Doc 1", meta={"category": "A"}), + Document(content="Doc 2", meta={"category": "B"}), + Document(content="Doc 3", meta={"category": "A"}), + Document(content="Doc 4", meta={"category": "C"}), + ] + await document_store.write_documents_async(docs) + + result = await document_store.count_unique_metadata_by_filter_async(filters={}, metadata_fields=["category"]) + assert result == {"category": 3} + + async def test_count_unique_metadata_by_filter_async_multiple_fields(self, document_store: QdrantDocumentStore): + """Test counting unique values for multiple metadata fields (async).""" + docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active"}), + Document(content="Doc 2", meta={"category": "B", "status": "active"}), + Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), + ] + await document_store.write_documents_async(docs) + + result = await document_store.count_unique_metadata_by_filter_async( + filters={}, metadata_fields=["category", "status"] + ) + assert result == {"category": 2, "status": 2} + + async def test_count_unique_metadata_by_filter_async_with_filter(self, document_store: QdrantDocumentStore): + """Test counting unique metadata field values with filtering (async).""" + docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active"}), + Document(content="Doc 2", meta={"category": "B", "status": "active"}), + Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), + ] + await document_store.write_documents_async(docs) + + result = await document_store.count_unique_metadata_by_filter_async( + filters={"field": "meta.status", "operator": "==", "value": "active"}, + metadata_fields=["category"], + ) + assert result == {"category": 2} + + async def test_get_metadata_field_unique_values_async(self, document_store: QdrantDocumentStore): + """Test getting unique metadata field values (async).""" + docs = [ + Document(content="Doc 1", meta={"category": "A"}), + Document(content="Doc 2", meta={"category": "B"}), + Document(content="Doc 3", meta={"category": "A"}), + Document(content="Doc 4", meta={"category": "C"}), + ] + await document_store.write_documents_async(docs) + + values = await document_store.get_metadata_field_unique_values_async("category") + assert len(values) == 3 + assert set(values) == {"A", "B", "C"} + + async def test_get_metadata_field_unique_values_async_pagination(self, document_store: QdrantDocumentStore): + """Test getting unique metadata field values with pagination (async).""" + docs = [Document(content=f"Doc {i}", meta={"value": i % 5}) for i in range(10)] + await document_store.write_documents_async(docs) + + # Get first 2 unique values + values_page_1 = await document_store.get_metadata_field_unique_values_async("value", limit=2, offset=0) + assert len(values_page_1) == 2 + + # Get next 2 unique values + values_page_2 = await document_store.get_metadata_field_unique_values_async("value", limit=2, offset=2) + assert len(values_page_2) == 2 + + # Values should not overlap + assert set(values_page_1) != set(values_page_2) + + async def test_get_metadata_field_unique_values_async_with_filter(self, document_store: QdrantDocumentStore): + """Test getting unique metadata field values with filtering (async).""" + docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active"}), + Document(content="Doc 2", meta={"category": "B", "status": "active"}), + Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), + ] + await document_store.write_documents_async(docs) + + values = await document_store.get_metadata_field_unique_values_async( + "category", filters={"field": "meta.status", "operator": "==", "value": "active"} + ) + assert set(values) == {"A", "B"} From 1b630406f8a2a229b6b2ba94eb3f996c5f44d19a Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Fri, 10 Apr 2026 13:02:33 +0500 Subject: [PATCH 04/10] fix(pgvector): return None min/max for unknown/empty metadata field get_metadata_field_min_max and its async counterpart now return {"min": None, "max": None} when the field is not present in the store (e.g. empty collection) instead of raising ValueError. Fixes test_get_metadata_field_min_max_empty_collection_async. --- .../document_stores/pgvector/document_store.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py index edbb3fae87..65b2bd487e 100644 --- a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py +++ b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py @@ -1846,15 +1846,14 @@ def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]: :returns: A dictionary with 'min' and 'max' keys containing the minimum and maximum values. For numeric fields (integer, real), returns numeric min/max. For text fields, returns lexicographic min/max based on database collation. - :raises ValueError: If the field doesn't exist or has no values. + Returns ``{"min": None, "max": None}`` when the field has no values or the store is empty. """ normalized_field = PgvectorDocumentStore._normalize_metadata_field_name(metadata_field) # Get field type information from metadata fields info fields_info = self.get_metadata_fields_info() if normalized_field not in fields_info: - msg = f"Metadata field '{metadata_field}' not found in document store" - raise ValueError(msg) + return {"min": None, "max": None} field_type = fields_info[normalized_field]["type"] sql_query = self._build_min_max_query(normalized_field, field_type) @@ -1879,15 +1878,14 @@ async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[st :returns: A dictionary with 'min' and 'max' keys containing the minimum and maximum values. For numeric fields (integer, real), returns numeric min/max. For text fields, returns lexicographic min/max based on database collation. - :raises ValueError: If the field doesn't exist or has no values. + Returns ``{"min": None, "max": None}`` when the field has no values or the store is empty. """ normalized_field = PgvectorDocumentStore._normalize_metadata_field_name(metadata_field) # Get field type information from metadata fields info fields_info = await self.get_metadata_fields_info_async() if normalized_field not in fields_info: - msg = f"Metadata field '{metadata_field}' not found in document store" - raise ValueError(msg) + return {"min": None, "max": None} field_type = fields_info[normalized_field]["type"] sql_query = self._build_min_max_query(normalized_field, field_type) From 2e727152f9ac416981d009b9bacdf6a67a6c1eb1 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Fri, 10 Apr 2026 13:07:26 +0500 Subject: [PATCH 05/10] fix(pgvector): update local empty-collection min/max test to expect None --- integrations/pgvector/tests/test_document_store.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/integrations/pgvector/tests/test_document_store.py b/integrations/pgvector/tests/test_document_store.py index 2b7e9613eb..1a5d425bea 100644 --- a/integrations/pgvector/tests/test_document_store.py +++ b/integrations/pgvector/tests/test_document_store.py @@ -51,11 +51,12 @@ def test_get_metadata_fields_info_empty_collection(self, document_store: Pgvecto assert fields_info == {"content": {"type": "text"}} def test_get_metadata_field_min_max_empty_collection(self, document_store: PgvectorDocumentStore): - """PgvectorDocumentStore raises ValueError when the field doesn't exist in the store.""" + """Returns None min/max when the field doesn't exist in the store.""" assert document_store.count_documents() == 0 - with pytest.raises(ValueError, match="not found in document store"): - document_store.get_metadata_field_min_max("priority") + result = document_store.get_metadata_field_min_max("priority") + assert result["min"] is None + assert result["max"] is None def test_write_documents(self, document_store: PgvectorDocumentStore): docs = [Document(id="1")] From ec66402edc9fce714dac38b59892055781f570d5 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Fri, 10 Apr 2026 14:47:02 +0500 Subject: [PATCH 06/10] fix(pgvector): return empty dict from get_metadata_fields_info on empty store The hardcoded 'content' entry was always included in fields_info even when the store had no documents. The mixin contract (and common sense) requires {} for an empty store. Removed the hardcoded initialisation and updated the local test to match. --- .../document_stores/pgvector/document_store.py | 2 +- integrations/pgvector/tests/test_document_store.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py index 65b2bd487e..0a32bbbf3c 100644 --- a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py +++ b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py @@ -1669,7 +1669,7 @@ def _analyze_metadata_fields_from_records(records: list[dict[str, Any]]) -> dict :param records: List of database records containing 'meta' field. :returns: A dictionary mapping field names to their type information. """ - fields_info: dict[str, dict[str, str]] = {"content": {"type": "text"}} + fields_info: dict[str, dict[str, str]] = {} # Analyze metadata from all documents for record in records: diff --git a/integrations/pgvector/tests/test_document_store.py b/integrations/pgvector/tests/test_document_store.py index 1a5d425bea..d6cdfdb14f 100644 --- a/integrations/pgvector/tests/test_document_store.py +++ b/integrations/pgvector/tests/test_document_store.py @@ -44,11 +44,11 @@ class TestDocumentStore( GetMetadataFieldUniqueValuesTest, ): def test_get_metadata_fields_info_empty_collection(self, document_store: PgvectorDocumentStore): - """PgvectorDocumentStore always includes 'content' in fields info, even for empty stores.""" + """Returns empty dict when the store has no documents.""" assert document_store.count_documents() == 0 fields_info = document_store.get_metadata_fields_info() - assert fields_info == {"content": {"type": "text"}} + assert fields_info == {} def test_get_metadata_field_min_max_empty_collection(self, document_store: PgvectorDocumentStore): """Returns None min/max when the field doesn't exist in the store.""" From 3160494e051c499cef6a2398e4d9a7895fe76088 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Fri, 10 Apr 2026 14:54:02 +0500 Subject: [PATCH 07/10] fix(pgvector): override assert_documents_are_equal in async test class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pgvector stores embeddings as float32, so round-tripped embeddings lose precision and exact Document equality fails. The same override already exists in test_filters.py for the sync FilterDocumentsTest — applying the same pattern to TestDocumentStoreAsync. --- .../tests/test_document_store_async.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/integrations/pgvector/tests/test_document_store_async.py b/integrations/pgvector/tests/test_document_store_async.py index 14a938655f..27e088109c 100644 --- a/integrations/pgvector/tests/test_document_store_async.py +++ b/integrations/pgvector/tests/test_document_store_async.py @@ -46,6 +46,25 @@ class TestDocumentStoreAsync( GetMetadataFieldMinMaxAsyncTest, GetMetadataFieldUniqueValuesAsyncTest, ): + @staticmethod + def assert_documents_are_equal(received: list[Document], expected: list[Document]): + """ + Embeddings lose float32 precision when round-tripped through pgvector, so we + compare them approximately and then do an exact equality check on the rest. + """ + import pytest + + assert len(received) == len(expected) + received.sort(key=lambda x: x.id) + expected.sort(key=lambda x: x.id) + for received_doc, expected_doc in zip(received, expected, strict=True): + if received_doc.embedding is None: + assert expected_doc.embedding is None + else: + assert received_doc.embedding == pytest.approx(expected_doc.embedding) + received_doc.embedding, expected_doc.embedding = None, None + assert received_doc == expected_doc + async def test_write_blob(self, document_store: PgvectorDocumentStore): bytestream = ByteStream(b"test", meta={"meta_key": "meta_value"}, mime_type="mime_type") docs = [Document(id="1", blob=bytestream)] From 56ab780f170eab1cd7f3620c33efca4571ff2819 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Fri, 10 Apr 2026 15:08:21 +0500 Subject: [PATCH 08/10] fix(pgvector): fix lint error and override broken mixin test Two fixes: 1. Move 'import pytest' from inside assert_documents_are_equal to top of file (PLC0415 lint rule). 2. Override test_count_not_empty_async: the haystack mixin defines this method without '@staticmethod' or 'self', so pytest passes the class instance as 'document_store'. Adding a proper 'self' override makes fixture injection work correctly. --- integrations/pgvector/tests/test_document_store_async.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/integrations/pgvector/tests/test_document_store_async.py b/integrations/pgvector/tests/test_document_store_async.py index 27e088109c..c374ce7682 100644 --- a/integrations/pgvector/tests/test_document_store_async.py +++ b/integrations/pgvector/tests/test_document_store_async.py @@ -52,8 +52,6 @@ def assert_documents_are_equal(received: list[Document], expected: list[Document Embeddings lose float32 precision when round-tripped through pgvector, so we compare them approximately and then do an exact equality check on the rest. """ - import pytest - assert len(received) == len(expected) received.sort(key=lambda x: x.id) expected.sort(key=lambda x: x.id) @@ -65,6 +63,13 @@ def assert_documents_are_equal(received: list[Document], expected: list[Document received_doc.embedding, expected_doc.embedding = None, None assert received_doc == expected_doc + async def test_count_not_empty_async(self, document_store: PgvectorDocumentStore): + """Override: mixin method is missing 'self', causing fixture injection to fail.""" + await document_store.write_documents_async( + [Document(content="test doc 1"), Document(content="test doc 2"), Document(content="test doc 3")] + ) + assert await document_store.count_documents_async() == 3 + async def test_write_blob(self, document_store: PgvectorDocumentStore): bytestream = ByteStream(b"test", meta={"meta_key": "meta_value"}, mime_type="mime_type") docs = [Document(id="1", blob=bytestream)] From 57ebe2ec716a23cadeea9325308266093efd94b1 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Fri, 10 Apr 2026 15:13:40 +0500 Subject: [PATCH 09/10] fix(pgvector): override test_write_documents_async with pgvector behaviour MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mixin raises NotImplementedError by design — each store must declare its own default duplicate policy. pgvector raises DuplicateDocumentError on a second write without an explicit policy, matching the sync test_write_documents override. --- .../pgvector/tests/test_document_store_async.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/integrations/pgvector/tests/test_document_store_async.py b/integrations/pgvector/tests/test_document_store_async.py index c374ce7682..bd8896a798 100644 --- a/integrations/pgvector/tests/test_document_store_async.py +++ b/integrations/pgvector/tests/test_document_store_async.py @@ -7,7 +7,8 @@ import psycopg import pytest from haystack.dataclasses.document import ByteStream, Document -from haystack.document_stores.errors import DocumentStoreError +from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy from haystack.testing.document_store import ( CountDocumentsByFilterAsyncTest, CountUniqueMetadataByFilterAsyncTest, @@ -63,6 +64,13 @@ def assert_documents_are_equal(received: list[Document], expected: list[Document received_doc.embedding, expected_doc.embedding = None, None assert received_doc == expected_doc + async def test_write_documents_async(self, document_store: PgvectorDocumentStore): + """pgvector default policy raises DuplicateDocumentError on duplicate writes.""" + docs = [Document(id="1")] + assert await document_store.write_documents_async(docs) == 1 + with pytest.raises(DuplicateDocumentError): + await document_store.write_documents_async(docs, DuplicatePolicy.FAIL) + async def test_count_not_empty_async(self, document_store: PgvectorDocumentStore): """Override: mixin method is missing 'self', causing fixture injection to fail.""" await document_store.write_documents_async( From 7eb8176422f7a6e9ce8ae4683dc806ffb6f26061 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Fri, 10 Apr 2026 15:23:28 +0500 Subject: [PATCH 10/10] fix(pgvector): bump minimum haystack-ai to 2.27.0 CountDocumentsByFilterAsyncTest, CountUniqueMetadataByFilterAsyncTest, and UpdateByFilterAsyncTest were added to haystack.testing.document_store in 2.27.0. The lowest-direct-deps CI run was resolving to 2.26.1 and failing with ImportError. --- integrations/pgvector/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/pgvector/pyproject.toml b/integrations/pgvector/pyproject.toml index a011115755..299672cee8 100644 --- a/integrations/pgvector/pyproject.toml +++ b/integrations/pgvector/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "haystack-ai>=2.26.1", + "haystack-ai>=2.27.0", "pgvector>=0.3.0", "psycopg[binary]" ]