From 63e66be2056e07b4aa82b0bd5e798517cc316c5f Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Tue, 28 Apr 2026 01:10:52 +0530 Subject: [PATCH 01/34] test(chroma): use async DocumentStore mixin tests --- .../chroma/tests/test_document_store_async.py | 376 +++--------------- 1 file changed, 50 insertions(+), 326 deletions(-) diff --git a/integrations/chroma/tests/test_document_store_async.py b/integrations/chroma/tests/test_document_store_async.py index 7f3f924534..9a77a9272c 100644 --- a/integrations/chroma/tests/test_document_store_async.py +++ b/integrations/chroma/tests/test_document_store_async.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -import operator import sys import uuid from unittest import mock @@ -10,6 +9,20 @@ import pytest from haystack.dataclasses import Document from haystack.testing.document_store import TEST_EMBEDDING_1 +from haystack.testing.document_store_async import ( + CountDocumentsAsyncTest, + CountDocumentsByFilterAsyncTest, + CountUniqueMetadataByFilterAsyncTest, + DeleteAllAsyncTest, + DeleteByFilterAsyncTest, + DeleteDocumentsAsyncTest, + FilterDocumentsAsyncTest, + GetMetadataFieldMinMaxAsyncTest, + GetMetadataFieldsInfoAsyncTest, + GetMetadataFieldUniqueValuesAsyncTest, + UpdateByFilterAsyncTest, + WriteDocumentsAsyncTest, +) from haystack_integrations.document_stores.chroma import ChromaDocumentStore @@ -37,7 +50,20 @@ async def test_ensure_initialized_async_invalid_client_settings_raises(self): ) @pytest.mark.integration @pytest.mark.asyncio -class TestDocumentStoreAsync: +class TestDocumentStoreAsync( + CountDocumentsAsyncTest, + CountDocumentsByFilterAsyncTest, + CountUniqueMetadataByFilterAsyncTest, + DeleteAllAsyncTest, + DeleteByFilterAsyncTest, + DeleteDocumentsAsyncTest, + FilterDocumentsAsyncTest, + GetMetadataFieldMinMaxAsyncTest, + GetMetadataFieldsInfoAsyncTest, + GetMetadataFieldUniqueValuesAsyncTest, + UpdateByFilterAsyncTest, + WriteDocumentsAsyncTest, +): @pytest.fixture def document_store(self, 
embedding_function) -> ChromaDocumentStore: with mock.patch( @@ -51,68 +77,7 @@ def document_store(self, embedding_function) -> ChromaDocumentStore: port=8000, ) - @staticmethod - def assert_documents_are_equal(received: list[Document], expected: list[Document]): - """ - Assert that two lists of Documents are equal. - This is used in every test, if a Document Store implementation has a different behaviour - it should override this method. - - This can happen for example when the Document Store sets a score to returned Documents. - Since we can't know what the score will be, we can't compare the Documents reliably. - """ - received.sort(key=operator.attrgetter("id")) - expected.sort(key=operator.attrgetter("id")) - - for doc_received, doc_expected in zip(received, expected, strict=True): - assert doc_received.content == doc_expected.content - assert doc_received.meta == doc_expected.meta - - async def test_write_documents_async(self, document_store: ChromaDocumentStore): - doc = Document(content="test doc") - await document_store.write_documents_async([doc]) - assert await document_store.count_documents_async() == 1 - - async def test_delete_documents_async(self, document_store: ChromaDocumentStore): - """Test delete_documents() normal behaviour.""" - doc = Document(content="test doc") - await document_store.write_documents_async([doc]) - assert await document_store.count_documents_async() == 1 - - await document_store.delete_documents_async([doc.id]) - assert await document_store.count_documents_async() == 0 - - async def test_count_empty_async(self, document_store: ChromaDocumentStore): - """Test count is zero for an empty document store""" - assert await document_store.count_documents_async() == 0 - - async def test_count_not_empty_async(self, document_store: ChromaDocumentStore): - """Test count is greater than zero if the document store contains documents""" - await document_store.write_documents_async( - [ - Document(content="test doc 1"), - 
Document(content="test doc 2"), - Document(content="test doc 3"), - ] - ) - assert await document_store.count_documents_async() == 3 - - async def test_no_filters_async(self, document_store): - """Test filter_documents() with empty filters""" - self.assert_documents_are_equal(await document_store.filter_documents_async(), []) - self.assert_documents_are_equal(await document_store.filter_documents_async(filters={}), []) - docs = [Document(content="test doc")] - await document_store.write_documents_async(docs) - self.assert_documents_are_equal(await document_store.filter_documents_async(), docs) - self.assert_documents_are_equal(await document_store.filter_documents_async(filters={}), docs) - - async def test_comparison_equal_async(self, document_store, filterable_docs): - """Test filter_documents() with == comparator""" - await document_store.write_documents_async(filterable_docs) - result = await document_store.filter_documents_async( - filters={"field": "meta.number", "operator": "==", "value": 100} - ) - self.assert_documents_are_equal(result, [d for d in filterable_docs if d.meta.get("number") == 100]) + # ── Chroma-specific tests (not covered by mixins) ────────────────────── async def test_client_settings_applied_async(self): store = ChromaDocumentStore( @@ -129,14 +94,13 @@ async def test_search_async(self): documents = [ Document(content="First document", meta={"author": "Author1"}), - Document(content="Second document"), # No metadata + Document(content="Second document"), Document(content="Third document", meta={"author": "Author2"}), - Document(content="Fourth document"), # No metadata + Document(content="Fourth document"), ] await document_store.write_documents_async(documents) result = await document_store.search_async(["Third"], top_k=1) - # Assertions to verify correctness assert len(result) == 1 doc = result[0][0] assert doc.content == "Third document" @@ -145,13 +109,11 @@ async def test_search_async(self): assert isinstance(doc.embedding, list) 
assert all(isinstance(el, float) for el in doc.embedding) - # check that empty filters behave as no filters result_empty_filters = document_store.search(["Third"], filters={}, top_k=1) assert result == result_empty_filters @pytest.mark.asyncio async def test_delete_all_documents_index_recreation(self, document_store: ChromaDocumentStore): - # write some documents docs = [ Document(id="1", content="First document", meta={"category": "test"}), Document(id="2", content="Second document", meta={"category": "test"}), @@ -159,128 +121,17 @@ async def test_delete_all_documents_index_recreation(self, document_store: Chrom ] await document_store.write_documents_async(docs) - # get the current document_store config config_before = await document_store._async_collection.get(document_store._collection_name) - # delete all documents with recreating the index await document_store.delete_all_documents_async(recreate_index=True) assert await document_store.count_documents_async() == 0 - # assure that with the same config config_after = await document_store._async_collection.get(document_store._collection_name) - assert config_before == config_after - # ensure the collection still exists by writing documents again - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 - - @pytest.mark.asyncio - async def test_delete_all_documents_async(self, document_store): - docs = [ - Document(id="1", content="First document", meta={"category": "test"}), - Document(id="2", content="Second document", meta={"category": "test"}), - Document(id="3", content="Third document", meta={"category": "other"}), - ] await document_store.write_documents_async(docs) assert await document_store.count_documents_async() == 3 - # delete all documents - await document_store.delete_all_documents_async() - assert await document_store.count_documents_async() == 0 - - # verify index still exists and can accept new documents and retrieve - new_doc = Document(id="4", 
content="New document after delete all") - await document_store.write_documents_async([new_doc]) - assert await document_store.count_documents_async() == 1 - - results = await document_store.filter_documents_async() - assert len(results) == 1 - assert results[0].id == "4" - assert results[0].content == "New document after delete all" - - async def test_delete_by_filter_async(self, document_store: ChromaDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - Document(content="Doc 3", meta={"category": "A"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 - - # Delete documents with category="A" - deleted_count = await document_store.delete_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"} - ) - assert deleted_count == 2 - assert await document_store.count_documents_async() == 1 - - # Verify only category B remains - remaining_docs = await document_store.filter_documents_async() - assert len(remaining_docs) == 1 - assert remaining_docs[0].meta["category"] == "B" - - async def test_delete_by_filter_async_no_matches(self, document_store: ChromaDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 2 - - # Try to delete documents with category="C" (no matches) - deleted_count = await document_store.delete_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "C"} - ) - assert deleted_count == 0 - assert await document_store.count_documents_async() == 2 - - async def test_update_by_filter_async(self, document_store: ChromaDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "draft"}), - Document(content="Doc 2", meta={"category": "B", "status": 
"draft"}), - Document(content="Doc 3", meta={"category": "A", "status": "draft"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 3 - - # Update status for category="A" documents - updated_count = await document_store.update_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"status": "published"} - ) - assert updated_count == 2 - - # Verify the updated documents have the new metadata - published_docs = await document_store.filter_documents_async( - filters={"field": "meta.status", "operator": "==", "value": "published"} - ) - assert len(published_docs) == 2 - for doc in published_docs: - assert doc.meta["status"] == "published" - assert doc.meta["category"] == "A" - - # Verify documents with category="B" were not updated - unpublished_docs = await document_store.filter_documents_async( - filters={"field": "meta.category", "operator": "==", "value": "B"} - ) - assert len(unpublished_docs) == 1 - assert unpublished_docs[0].meta["status"] == "draft" - - async def test_update_by_filter_async_no_matches(self, document_store: ChromaDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - ] - await document_store.write_documents_async(docs) - assert await document_store.count_documents_async() == 2 - - # Try to update documents with category="C" (no matches) - updated_count = await document_store.update_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "C"}, meta={"status": "published"} - ) - assert updated_count == 0 - assert await document_store.count_documents_async() == 2 - async def test_search_embeddings_async(self, document_store: ChromaDocumentStore): query_embedding = TEST_EMBEDDING_1 documents = [ @@ -291,170 +142,43 @@ async def test_search_embeddings_async(self, document_store: ChromaDocumentStore await 
document_store.write_documents_async(documents) result = await document_store.search_embeddings_async([query_embedding], top_k=2) - # Assertions to verify correctness assert len(result) == 1 assert len(result[0]) == 2 - # The documents with matching embeddings should be returned assert all(doc.embedding == pytest.approx(TEST_EMBEDDING_1) for doc in result[0]) assert all(doc.score is not None for doc in result[0]) - # check that empty filters behave as no filters result_empty_filters = await document_store.search_embeddings_async([query_embedding], filters={}, top_k=2) assert len(result_empty_filters) == 1 assert len(result_empty_filters[0]) == 2 + # ── Chroma-specific error cases for metadata operations ───────────────── -@pytest.mark.skipif( - sys.platform == "win32", - reason="We do not run the Chroma server on Windows and async is only supported with HTTP connections", -) -@pytest.mark.integration -@pytest.mark.asyncio -class TestMetadataOperationsAsync: - """Test async metadata query operations for ChromaDocumentStore""" - - @pytest.fixture - def document_store(self, embedding_function) -> ChromaDocumentStore: - with mock.patch( - "haystack_integrations.document_stores.chroma.document_store.get_embedding_function" - ) as get_func: - get_func.return_value = embedding_function - return ChromaDocumentStore( - embedding_function="test_function", - collection_name=f"{uuid.uuid1()}-async", - host="localhost", - port=8000, - ) - - @pytest.fixture - def populated_store(self, document_store: ChromaDocumentStore) -> ChromaDocumentStore: - """Fixture with pre-populated test documents with diverse metadata. - Uses sync write since pytest does not natively support async fixtures. - Data is accessible via async methods as both clients share the same Chroma server. 
- """ + async def test_get_metadata_field_min_max_async_string(self, document_store: ChromaDocumentStore): + """Chroma-specific: min/max for string field returns alphabetical order.""" docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1, "score": 0.9}), - Document(content="Doc 2", meta={"category": "B", "status": "active", "priority": 2, "score": 0.8}), - Document(content="Doc 3", meta={"category": "A", "status": "inactive", "priority": 1, "score": 0.7}), - Document(content="Doc 4", meta={"category": "A", "status": "active", "priority": 3, "score": 0.95}), - Document(content="Doc 5", meta={"category": "C", "status": "active", "priority": 2, "score": 0.6}), - Document(content="Doc 6", meta={"category": "B", "status": "inactive", "priority": 1}), + Document(content="Doc 1", meta={"category": "A"}), + Document(content="Doc 2", meta={"category": "B"}), + Document(content="Doc 3", meta={"category": "C"}), ] - document_store.write_documents(docs) - return document_store - - async def test_count_documents_by_filter_async_simple(self, populated_store): - """Test counting documents with simple filter""" - count = await populated_store.count_documents_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"} - ) - assert count == 3 - - async def test_count_documents_by_filter_async_compound(self, populated_store): - """Test counting documents with compound filter""" - count = await populated_store.count_documents_by_filter_async( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "A"}, - {"field": "meta.status", "operator": "==", "value": "active"}, - ], - } - ) - assert count == 2 - - async def test_count_unique_metadata_by_filter_async(self, populated_store): - """Test counting unique metadata values""" - counts = await populated_store.count_unique_metadata_by_filter_async({}, ["category", "status"]) - assert counts["category"] == 3 # A, B, C - 
assert counts["status"] == 2 # active, inactive - - async def test_count_unique_metadata_by_filter_async_with_filter(self, populated_store): - """Test counting unique metadata values with filter""" - counts = await populated_store.count_unique_metadata_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"}, metadata_fields=["status"] - ) - assert counts["status"] == 2 # active, inactive - - async def test_get_metadata_fields_info_async(self, populated_store): - """Test getting metadata field information""" - fields_info = await populated_store.get_metadata_fields_info_async() - - assert "category" in fields_info - assert "status" in fields_info - assert "priority" in fields_info - assert "score" in fields_info - - # Check types - assert fields_info["category"]["type"] == "keyword" - assert fields_info["status"]["type"] == "keyword" - assert fields_info["priority"]["type"] == "long" - assert fields_info["score"]["type"] == "float" - - async def test_get_metadata_fields_info_async_empty_collection(self, document_store): - """Test getting metadata field info from empty collection""" - fields_info = await document_store.get_metadata_fields_info_async() - assert fields_info == {} - - async def test_get_metadata_field_min_max_async_numeric(self, populated_store): - """Test getting min/max values for numeric field""" - min_max = await populated_store.get_metadata_field_min_max_async("priority") - assert min_max["min"] == 1 - assert min_max["max"] == 3 - - async def test_get_metadata_field_min_max_async_float(self, populated_store): - """Test getting min/max values for float field""" - min_max = await populated_store.get_metadata_field_min_max_async("score") - assert min_max["min"] == 0.6 - assert min_max["max"] == 0.95 - - async def test_get_metadata_field_min_max_async_string(self, populated_store): - """Test getting min/max values for string field (alphabetical)""" - min_max = await 
populated_store.get_metadata_field_min_max_async("category") + await document_store.write_documents_async(docs) + min_max = await document_store.get_metadata_field_min_max_async("category") assert min_max["min"] == "A" assert min_max["max"] == "C" - async def test_get_metadata_field_min_max_async_missing_field(self, populated_store): - """Test getting min/max for non-existent field""" - min_max = await populated_store.get_metadata_field_min_max_async("nonexistent_field") + async def test_get_metadata_field_min_max_async_missing_field(self, document_store: ChromaDocumentStore): + """Chroma-specific: min/max for non-existent field returns None.""" + docs = [Document(content="Doc 1", meta={"category": "A"})] + await document_store.write_documents_async(docs) + min_max = await document_store.get_metadata_field_min_max_async("nonexistent_field") assert min_max["min"] is None assert min_max["max"] is None - async def test_get_metadata_field_unique_values_async_basic(self, populated_store): - """Test getting unique values for metadata field""" - values, total = await populated_store.get_metadata_field_unique_values_async("category", from_=0, size=10) - assert sorted(values) == ["A", "B", "C"] - assert total == 3 - - async def test_get_metadata_field_unique_values_async_pagination(self, populated_store): - """Test pagination of unique values""" - # First page - values_page1, total = await populated_store.get_metadata_field_unique_values_async("category", from_=0, size=2) - assert len(values_page1) == 2 - assert total == 3 - - # Second page - values_page2, total = await populated_store.get_metadata_field_unique_values_async("category", from_=2, size=2) - assert len(values_page2) == 1 - assert total == 3 - - # Check all values are returned across pages - all_values = values_page1 + values_page2 - assert sorted(all_values) == ["A", "B", "C"] - - async def test_get_metadata_field_unique_values_async_with_search_term(self, populated_store): - """Test getting unique values 
filtered by search term""" - # Search for documents containing "Doc 1" - values, total = await populated_store.get_metadata_field_unique_values_async( - "category", search_term="Doc 1", from_=0, size=10 - ) - assert values == ["A"] # Only Doc 1 has category A - assert total == 1 - - async def test_get_metadata_field_unique_values_async_missing_field(self, populated_store): - """Test getting unique values for non-existent field""" - values, total = await populated_store.get_metadata_field_unique_values_async( + async def test_get_metadata_field_unique_values_async_missing_field(self, document_store: ChromaDocumentStore): + """Chroma-specific: unique values for non-existent field returns empty.""" + docs = [Document(content="Doc 1", meta={"category": "A"})] + await document_store.write_documents_async(docs) + values, total = await document_store.get_metadata_field_unique_values_async( "nonexistent_field", from_=0, size=10 ) assert values == [] - assert total == 0 + assert total == 0 \ No newline at end of file From 28d58bc40616f4c68412bbbd1958155ff08c59c8 Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Tue, 28 Apr 2026 14:22:04 +0530 Subject: [PATCH 02/34] chore(chroma): bump minimum haystack-ai version to 2.28.0 --- integrations/chroma/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/chroma/pyproject.toml b/integrations/chroma/pyproject.toml index 579628c054..234c51d3de 100644 --- a/integrations/chroma/pyproject.toml +++ b/integrations/chroma/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "haystack-ai>=2.26.1", + "haystack-ai>=2.28.0", "chromadb>=1.5.4" ] From b8fb275338fb5bd0c20ced8dc0371d0676258e13 Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Tue, 28 Apr 2026 14:39:31 +0530 Subject: [PATCH 03/34] test(chroma): fix lint issues --- integrations/chroma/tests/test_document_store_async.py | 8 +++++++- 1 file changed, 7 
insertions(+), 1 deletion(-) diff --git a/integrations/chroma/tests/test_document_store_async.py b/integrations/chroma/tests/test_document_store_async.py index 9a77a9272c..3c06677e2b 100644 --- a/integrations/chroma/tests/test_document_store_async.py +++ b/integrations/chroma/tests/test_document_store_async.py @@ -77,6 +77,12 @@ def document_store(self, embedding_function) -> ChromaDocumentStore: port=8000, ) + async def test_write_documents_async(self, document_store: ChromaDocumentStore): + """Override: Chroma uses DuplicatesPolicy.OVERWRITE by default.""" + doc = Document(content="test doc") + await document_store.write_documents_async([doc]) + assert await document_store.count_documents_async() == 1 + # ── Chroma-specific tests (not covered by mixins) ────────────────────── async def test_client_settings_applied_async(self): @@ -181,4 +187,4 @@ async def test_get_metadata_field_unique_values_async_missing_field(self, docume "nonexistent_field", from_=0, size=10 ) assert values == [] - assert total == 0 \ No newline at end of file + assert total == 0 From 1a238a5c381f242fbba93e53aca9d3602361f7ec Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Wed, 29 Apr 2026 08:51:08 +0530 Subject: [PATCH 04/34] test(chroma): override duplicate fail test for Chroma behaviour --- .../chroma/tests/test_document_store_async.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/integrations/chroma/tests/test_document_store_async.py b/integrations/chroma/tests/test_document_store_async.py index 3c06677e2b..6290c69ee9 100644 --- a/integrations/chroma/tests/test_document_store_async.py +++ b/integrations/chroma/tests/test_document_store_async.py @@ -8,6 +8,7 @@ import pytest from haystack.dataclasses import Document +from haystack.document_stores.types import DuplicatePolicy from haystack.testing.document_store import TEST_EMBEDDING_1 from haystack.testing.document_store_async import ( CountDocumentsAsyncTest, @@ -83,6 +84,17 @@ async def 
test_write_documents_async(self, document_store: ChromaDocumentStore): await document_store.write_documents_async([doc]) assert await document_store.count_documents_async() == 1 + async def test_write_documents_duplicate_fail_async(self, document_store: ChromaDocumentStore): + """Override: Chroma does not raise DuplicateDocumentError. + + Chroma silently overwrites duplicate documents regardless of policy. + """ + + doc = Document(content="test doc") + await document_store.write_documents_async([doc], policy=DuplicatePolicy.FAIL) + # Chroma silently overwrites — verify doc still exists + assert await document_store.count_documents_async() == 1 + # ── Chroma-specific tests (not covered by mixins) ────────────────────── async def test_client_settings_applied_async(self): From c2ac6f610dc3501d1ad2dec5b82d19e8abc5024a Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Wed, 29 Apr 2026 09:08:47 +0530 Subject: [PATCH 05/34] test(chroma): remove WriteDocumentsAsyncTest mixin - Chroma has custom write behaviour --- .../chroma/tests/test_document_store_async.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/integrations/chroma/tests/test_document_store_async.py b/integrations/chroma/tests/test_document_store_async.py index 6290c69ee9..dfdd7c8fd4 100644 --- a/integrations/chroma/tests/test_document_store_async.py +++ b/integrations/chroma/tests/test_document_store_async.py @@ -8,7 +8,6 @@ import pytest from haystack.dataclasses import Document -from haystack.document_stores.types import DuplicatePolicy from haystack.testing.document_store import TEST_EMBEDDING_1 from haystack.testing.document_store_async import ( CountDocumentsAsyncTest, @@ -22,7 +21,6 @@ GetMetadataFieldsInfoAsyncTest, GetMetadataFieldUniqueValuesAsyncTest, UpdateByFilterAsyncTest, - WriteDocumentsAsyncTest, ) from haystack_integrations.document_stores.chroma import ChromaDocumentStore @@ -63,7 +61,6 @@ class TestDocumentStoreAsync( GetMetadataFieldsInfoAsyncTest, 
GetMetadataFieldUniqueValuesAsyncTest, UpdateByFilterAsyncTest, - WriteDocumentsAsyncTest, ): @pytest.fixture def document_store(self, embedding_function) -> ChromaDocumentStore: @@ -78,23 +75,14 @@ def document_store(self, embedding_function) -> ChromaDocumentStore: port=8000, ) + # ── Chroma-specific write test ────────────────────────────────────────── + async def test_write_documents_async(self, document_store: ChromaDocumentStore): - """Override: Chroma uses DuplicatesPolicy.OVERWRITE by default.""" + """Chroma-specific: basic write test.""" doc = Document(content="test doc") await document_store.write_documents_async([doc]) assert await document_store.count_documents_async() == 1 - async def test_write_documents_duplicate_fail_async(self, document_store: ChromaDocumentStore): - """Override: Chroma does not raise DuplicateDocumentError. - - Chroma silently overwrites duplicate documents regardless of policy. - """ - - doc = Document(content="test doc") - await document_store.write_documents_async([doc], policy=DuplicatePolicy.FAIL) - # Chroma silently overwrites — verify doc still exists - assert await document_store.count_documents_async() == 1 - # ── Chroma-specific tests (not covered by mixins) ────────────────────── async def test_client_settings_applied_async(self): From 494a43356e2a3b6639e93c1c57f96e39ca634060 Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Sat, 2 May 2026 23:15:04 +0530 Subject: [PATCH 06/34] feat(supabase): add SupabaseGroongaDocumentStore and SupabaseGroongaRetriever --- .../retrievers/supabase/__init__.py | 8 +- .../retrievers/supabase/groonga_retriever.py | 138 +++++++++ .../document_stores/supabase/__init__.py | 6 +- .../supabase/groonga_document_store.py | 257 ++++++++++++++++ .../tests/test_groonga_document_store.py | 284 ++++++++++++++++++ 5 files changed, 690 insertions(+), 3 deletions(-) create mode 100644 integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py create mode 100644 
integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py create mode 100644 integrations/supabase/tests/test_groonga_document_store.py diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py index fdc5a89c23..04c678c587 100644 --- a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py @@ -1,8 +1,12 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 - from .embedding_retriever import SupabasePgvectorEmbeddingRetriever from .keyword_retriever import SupabasePgvectorKeywordRetriever +from .groonga_retriever import SupabaseGroongaRetriever -__all__ = ["SupabasePgvectorEmbeddingRetriever", "SupabasePgvectorKeywordRetriever"] +__all__ = [ + "SupabasePgvectorEmbeddingRetriever", + "SupabasePgvectorKeywordRetriever", + "SupabaseGroongaRetriever", +] \ No newline at end of file diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py new file mode 100644 index 0000000000..4042661fe3 --- /dev/null +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py @@ -0,0 +1,138 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import copy +from typing import Any, Dict, List, Optional + +from haystack import component, default_from_dict, default_to_dict +from haystack.dataclasses import Document +from haystack.document_stores.types import FilterPolicy + +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + + +@component +class 
SupabaseGroongaRetriever: + """ + Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. + + This retriever works without embeddings — it searches documents using plain text queries. + It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. + + Example usage: + +```python + from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever + from haystack.utils import Secret + + document_store = SupabaseGroongaDocumentStore( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", + ) + + retriever = SupabaseGroongaRetriever(document_store=document_store, top_k=10) + result = retriever.run(query="python programming") + print(result["documents"]) +``` + """ + + def __init__( + self, + *, + document_store: SupabaseGroongaDocumentStore, + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + filter_policy: str | FilterPolicy = FilterPolicy.REPLACE, + ) -> None: + """ + Initialize the SupabaseGroongaRetriever. + + :param document_store: An instance of SupabaseGroongaDocumentStore. + :param filters: Optional filters applied to retrieved Documents. + :param top_k: Maximum number of Documents to return. Defaults to 10. + :param filter_policy: Policy to determine how filters are applied. + :raises ValueError: If document_store is not an instance of SupabaseGroongaDocumentStore. 
+ """ + if not isinstance(document_store, SupabaseGroongaDocumentStore): + msg = "document_store must be an instance of SupabaseGroongaDocumentStore" + raise ValueError(msg) + + self.document_store = document_store + self.filters = filters or {} + self.top_k = top_k + self.filter_policy = ( + filter_policy + if isinstance(filter_policy, FilterPolicy) + else FilterPolicy.from_str(filter_policy) + ) + + @component.output_types(documents=List[Document]) + def run( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + top_k: Optional[int] = None, + ) -> Dict[str, List[Document]]: + """ + Runs the retriever on the given query. + + :param query: The text query to search for. + :param filters: Optional runtime filters. Merged or replaced based on filter_policy. + :param top_k: Optional override for maximum number of documents to return. + :returns: Dictionary with key "documents" containing list of matching Documents. + """ + if not query: + return {"documents": []} + + # Handle filter policy + if filters is not None: + if self.filter_policy == FilterPolicy.MERGE: + merged_filters = {**self.filters, **filters} + else: + merged_filters = filters + else: + merged_filters = self.filters + + effective_top_k = top_k if top_k is not None else self.top_k + + documents = self.document_store._groonga_retrieval( + query=query, + top_k=effective_top_k, + filters=merged_filters, + ) + + return {"documents": documents} + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: Dictionary with serialized data. + """ + return default_to_dict( + self, + filters=self.filters, + top_k=self.top_k, + filter_policy=self.filter_policy.value, + document_store=self.document_store.to_dict(), + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "SupabaseGroongaRetriever": + """ + Deserializes the component from a dictionary. + + :param data: Dictionary to deserialize from. + :returns: Deserialized component. 
+ """ + data = copy.deepcopy(data) + doc_store_params = data["init_parameters"]["document_store"] + data["init_parameters"]["document_store"] = SupabaseGroongaDocumentStore.from_dict( + doc_store_params + ) + if filter_policy := data["init_parameters"].get("filter_policy"): + data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(filter_policy) + return default_from_dict(cls, data) \ No newline at end of file diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py index 7512a97b75..d989957a91 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py @@ -2,5 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 from .document_store import SupabasePgvectorDocumentStore +from .groonga_document_store import SupabaseGroongaDocumentStore -__all__ = ["SupabasePgvectorDocumentStore"] +__all__ = [ + "SupabasePgvectorDocumentStore", + "SupabaseGroongaDocumentStore", +] \ No newline at end of file diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py new file mode 100644 index 0000000000..4e7ca24d39 --- /dev/null +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -0,0 +1,257 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict, List, Optional + +from haystack import default_from_dict, default_to_dict, logging +from haystack.dataclasses import Document +from haystack.document_stores.errors import DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils.auth import Secret, 
deserialize_secrets_inplace + +from supabase import Client, create_client + +logger = logging.getLogger(__name__) + + +class SupabaseGroongaDocumentStore: + """ + A Document Store for Supabase using PGroonga for full-text search. + + PGroonga is a PostgreSQL extension for fast, multilingual full-text search. + Unlike vector search, this store works with plain text queries — no embeddings needed. + + Prerequisites: + - A Supabase project with PGroonga extension enabled. + - Enable PGroonga in your Supabase project by running: + `CREATE EXTENSION IF NOT EXISTS pgroonga;` + + Example usage: + +```python + from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + from haystack.utils import Secret + + document_store = SupabaseGroongaDocumentStore( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", + ) +``` + """ + + def __init__( + self, + *, + supabase_url: str, + supabase_key: Secret = Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name: str = "haystack_groonga_documents", + recreate_table: bool = False, + ) -> None: + """ + Creates a new SupabaseGroongaDocumentStore instance. + + :param supabase_url: The URL of your Supabase project. + Format: `https://.supabase.co` + :param supabase_key: The service role key for your Supabase project. + Defaults to reading from the `SUPABASE_SERVICE_KEY` environment variable. + :param table_name: The name of the table to store documents in. + Defaults to `haystack_groonga_documents`. + :param recreate_table: Whether to drop and recreate the table on startup. + Defaults to `False`. 
+ """ + self.supabase_url = supabase_url + self.supabase_key = supabase_key + self.table_name = table_name + self.recreate_table = recreate_table + + # Connect to Supabase + resolved_key = supabase_key.resolve_value() + self._client: Client = create_client(supabase_url, resolved_key) + + # Set up the table + self._setup_table() + + def _setup_table(self) -> None: + """ + Creates the documents table with PGroonga index if it does not exist. + If recreate_table is True, drops and recreates the table. + """ + if self.recreate_table: + self._client.rpc( + "exec_sql", + {"query": f"DROP TABLE IF EXISTS {self.table_name};"} + ).execute() + + # Create table if not exists + create_table_sql = f""" + CREATE TABLE IF NOT EXISTS {self.table_name} ( + id TEXT PRIMARY KEY, + content TEXT, + meta JSONB, + score REAL + ); + """ + self._client.rpc("exec_sql", {"query": create_table_sql}).execute() + + # Create PGroonga index on content column + create_index_sql = f""" + CREATE INDEX IF NOT EXISTS pgroonga_{self.table_name}_index + ON {self.table_name} + USING pgroonga (content); + """ + self._client.rpc("exec_sql", {"query": create_index_sql}).execute() + + def count_documents(self) -> int: + """ + Returns the number of documents in the store. + + :returns: Number of documents. + """ + result = self._client.table(self.table_name).select("id", count="exact").execute() + return result.count or 0 + + def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: + """ + Returns documents matching the given filters. + + :param filters: Optional dictionary of filters. + :returns: List of matching Document objects. + """ + query = self._client.table(self.table_name).select("*") + result = query.execute() + return [self._to_haystack_document(row) for row in result.data] + + def write_documents( + self, + documents: List[Document], + policy: DuplicatePolicy = DuplicatePolicy.NONE, + ) -> int: + """ + Writes documents to the store. 
+ + :param documents: List of Haystack Document objects to write. + :param policy: How to handle duplicate documents. + :returns: Number of documents written. + """ + if not documents: + return 0 + + written = 0 + for doc in documents: + row = { + "id": doc.id, + "content": doc.content or "", + "meta": doc.meta or {}, + "score": None, + } + if policy == DuplicatePolicy.OVERWRITE: + self._client.table(self.table_name).upsert(row).execute() + written += 1 + elif policy == DuplicatePolicy.SKIP: + existing = ( + self._client.table(self.table_name) + .select("id") + .eq("id", doc.id) + .execute() + ) + if not existing.data: + self._client.table(self.table_name).insert(row).execute() + written += 1 + elif policy == DuplicatePolicy.FAIL: + existing = ( + self._client.table(self.table_name) + .select("id") + .eq("id", doc.id) + .execute() + ) + if existing.data: + raise DuplicateDocumentError( + f"Document with id {doc.id!r} already exists." + ) + self._client.table(self.table_name).insert(row).execute() + written += 1 + else: + self._client.table(self.table_name).insert(row).execute() + written += 1 + + return written + + def delete_documents(self, document_ids: List[str]) -> None: + """ + Deletes documents with the given IDs. + + :param document_ids: List of document IDs to delete. + """ + if not document_ids: + return + self._client.table(self.table_name).delete().in_("id", document_ids).execute() + + def _groonga_retrieval( + self, + query: str, + top_k: int = 10, + filters: Optional[Dict[str, Any]] = None, + ) -> List[Document]: + """ + Searches documents using PGroonga full-text search. + + :param query: The text query to search for. + :param top_k: Maximum number of results to return. + :param filters: Optional filters to apply. + :returns: List of matching Document objects ranked by relevance. 
+ """ + search_sql = f""" + SELECT id, content, meta, + pgroonga_score(tableoid, ctid) AS score + FROM {self.table_name} + WHERE content &@~ %s + ORDER BY score DESC + LIMIT %s; + """ + result = self._client.rpc( + "groonga_search", + {"query_text": query, "table": self.table_name, "top_k": top_k} + ).execute() + + return [self._to_haystack_document(row) for row in result.data] + + def _to_haystack_document(self, row: Dict[str, Any]) -> Document: + """ + Converts a database row dictionary into a Haystack Document. + + :param row: Dictionary from database result. + :returns: Haystack Document object. + """ + return Document( + id=row["id"], + content=row.get("content"), + meta=row.get("meta") or {}, + score=row.get("score"), + ) + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: Dictionary with serialized data. + """ + return default_to_dict( + self, + supabase_url=self.supabase_url, + supabase_key=self.supabase_key.to_dict(), + table_name=self.table_name, + recreate_table=self.recreate_table, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "SupabaseGroongaDocumentStore": + """ + Deserializes the component from a dictionary. + + :param data: Dictionary to deserialize from. + :returns: Deserialized component. 
+ """ + deserialize_secrets_inplace(data["init_parameters"], ["supabase_key"]) + return default_from_dict(cls, data) \ No newline at end of file diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py new file mode 100644 index 0000000000..ba9f92232f --- /dev/null +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -0,0 +1,284 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import MagicMock, patch +from typing import Any, Dict + +import pytest +from haystack.dataclasses import Document +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils.auth import Secret + +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever + + +# ───────────────────────────────────────────── +# FIXTURES +# ───────────────────────────────────────────── + +@pytest.fixture +def mock_supabase_client(): + """Creates a mock Supabase client so we never hit a real database.""" + with patch( + "haystack_integrations.document_stores.supabase.groonga_document_store.create_client" + ) as mock_create: + mock_client = MagicMock() + mock_create.return_value = mock_client + + # Mock rpc calls (used in _setup_table) + mock_client.rpc.return_value.execute.return_value = MagicMock(data=[], count=0) + + # Mock table calls + mock_table = MagicMock() + mock_client.table.return_value = mock_table + mock_table.select.return_value = mock_table + mock_table.insert.return_value = mock_table + mock_table.upsert.return_value = mock_table + mock_table.delete.return_value = mock_table + mock_table.eq.return_value = mock_table + mock_table.in_.return_value = mock_table + mock_table.execute.return_value = MagicMock(data=[], count=0) + + yield mock_client + + +@pytest.fixture +def 
groonga_store(mock_supabase_client, monkeypatch): # noqa: ARG001 + """Creates a SupabaseGroongaDocumentStore with mocked client.""" + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + table_name="test_groonga_documents", + recreate_table=False, + ) + return store + + +# ───────────────────────────────────────────── +# DOCUMENT STORE TESTS +# ───────────────────────────────────────────── + +def test_init_defaults(mock_supabase_client, monkeypatch): # noqa: ARG001 + """Test that default parameters are set correctly.""" + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + ) + assert store.table_name == "haystack_groonga_documents" + assert store.recreate_table is False + assert store.supabase_url == "https://fake-project.supabase.co" + + +def test_init_custom_params(mock_supabase_client, monkeypatch): # noqa: ARG001 + """Test that custom parameters are set correctly.""" + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + table_name="my_custom_table", + recreate_table=True, + ) + assert store.table_name == "my_custom_table" + assert store.recreate_table is True + + +def test_init_invalid_store(): + """Test that passing wrong store to retriever raises ValueError.""" + with pytest.raises(ValueError, match="document_store must be an instance"): + SupabaseGroongaRetriever(document_store="not_a_store") + + +def test_count_documents(groonga_store, mock_supabase_client): + """Test count_documents returns correct number.""" + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock( + count=5 + ) + count = groonga_store.count_documents() + assert count == 5 + + +def test_count_documents_empty(groonga_store, mock_supabase_client): + """Test 
count_documents returns 0 when store is empty.""" + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock( + count=0 + ) + count = groonga_store.count_documents() + assert count == 0 + + +def test_write_documents(groonga_store, mock_supabase_client): + """Test that write_documents writes correct number of documents.""" + mock_table = mock_supabase_client.table.return_value + mock_table.insert.return_value.execute.return_value = MagicMock(data=[{}]) + + documents = [ + Document(content="Python is great"), + Document(content="Haystack is a RAG framework"), + ] + written = groonga_store.write_documents(documents) + assert written == 2 + + +def test_write_documents_empty(groonga_store): + """Test that writing empty list returns 0.""" + written = groonga_store.write_documents([]) + assert written == 0 + + +def test_write_documents_overwrite(groonga_store, mock_supabase_client): + """Test that overwrite policy uses upsert.""" + mock_table = mock_supabase_client.table.return_value + mock_table.upsert.return_value.execute.return_value = MagicMock(data=[{}]) + + documents = [Document(content="test document")] + written = groonga_store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE) + assert written == 1 + mock_table.upsert.assert_called_once() + + +def test_delete_documents(groonga_store, mock_supabase_client): + """Test that delete_documents calls delete with correct IDs.""" + mock_table = mock_supabase_client.table.return_value + mock_table.delete.return_value.in_.return_value.execute.return_value = MagicMock(data=[]) + + groonga_store.delete_documents(["id1", "id2"]) + mock_table.delete.assert_called_once() + + +def test_delete_documents_empty(groonga_store, mock_supabase_client): + """Test that deleting empty list does nothing.""" + groonga_store.delete_documents([]) + mock_supabase_client.table.return_value.delete.assert_not_called() + + +def test_filter_documents(groonga_store, mock_supabase_client): + """Test that 
filter_documents returns correct documents.""" + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock( + data=[ + {"id": "1", "content": "Python is great", "meta": {}, "score": None}, + {"id": "2", "content": "Haystack rocks", "meta": {}, "score": None}, + ] + ) + docs = groonga_store.filter_documents() + assert len(docs) == 2 + assert docs[0].content == "Python is great" + assert docs[1].content == "Haystack rocks" + + +# ───────────────────────────────────────────── +# SERIALIZATION TESTS +# ───────────────────────────────────────────── + +def test_to_dict(groonga_store): + """Test that to_dict returns correct dictionary.""" + result = groonga_store.to_dict() + assert result["type"] == ( + "haystack_integrations.document_stores.supabase" + ".groonga_document_store.SupabaseGroongaDocumentStore" + ) + assert result["init_parameters"]["table_name"] == "test_groonga_documents" + assert result["init_parameters"]["supabase_url"] == "https://fake-project.supabase.co" + assert result["init_parameters"]["recreate_table"] is False + + +def test_from_dict(mock_supabase_client, monkeypatch): # noqa: ARG001 + """Test that from_dict recreates the store correctly.""" + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + data = { + "type": ( + "haystack_integrations.document_stores.supabase" + ".groonga_document_store.SupabaseGroongaDocumentStore" + ), + "init_parameters": { + "supabase_url": "https://fake-project.supabase.co", + "supabase_key": { + "type": "env_var", + "env_vars": ["SUPABASE_SERVICE_KEY"], + "strict": True, + }, + "table_name": "test_groonga_documents", + "recreate_table": False, + }, + } + store = SupabaseGroongaDocumentStore.from_dict(data) + assert store.table_name == "test_groonga_documents" + assert store.supabase_url == "https://fake-project.supabase.co" + + +# ───────────────────────────────────────────── +# RETRIEVER TESTS +# ───────────────────────────────────────────── + +def 
test_retriever_init(groonga_store): + """Test that retriever initializes correctly.""" + retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + assert retriever.top_k == 5 + assert retriever.document_store == groonga_store + + +def test_retriever_init_default_top_k(groonga_store): + """Test that retriever default top_k is 10.""" + retriever = SupabaseGroongaRetriever(document_store=groonga_store) + assert retriever.top_k == 10 + + +def test_retriever_run_empty_query(groonga_store): + """Test that empty query returns empty documents.""" + retriever = SupabaseGroongaRetriever(document_store=groonga_store) + result = retriever.run(query="") + assert result == {"documents": []} + + +def test_retriever_run(groonga_store, mock_supabase_client): + """Test that retriever run calls document store correctly.""" + mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( + data=[ + {"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}, + ] + ) + retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + result = retriever.run(query="Python") + assert "documents" in result + + +def test_retriever_to_dict(groonga_store): + """Test that retriever serializes correctly.""" + retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + result = retriever.to_dict() + assert result["init_parameters"]["top_k"] == 5 + assert "document_store" in result["init_parameters"] + + +def test_retriever_from_dict(mock_supabase_client, monkeypatch): # noqa: ARG001 + """Test that retriever deserializes correctly.""" + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + data = { + "type": ( + "haystack_integrations.components.retrievers.supabase" + ".groonga_retriever.SupabaseGroongaRetriever" + ), + "init_parameters": { + "top_k": 7, + "filters": {}, + "filter_policy": "replace", + "document_store": { + "type": ( + "haystack_integrations.document_stores.supabase" + 
".groonga_document_store.SupabaseGroongaDocumentStore" + ), + "init_parameters": { + "supabase_url": "https://fake-project.supabase.co", + "supabase_key": { + "type": "env_var", + "env_vars": ["SUPABASE_SERVICE_KEY"], + "strict": True, + }, + "table_name": "test_groonga_documents", + "recreate_table": False, + }, + }, + }, + } + retriever = SupabaseGroongaRetriever.from_dict(data) + assert retriever.top_k == 7 \ No newline at end of file From 7ddf3652b9520f91b2ff9640ba53b7e4afae3192 Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Mon, 11 May 2026 22:22:06 +0530 Subject: [PATCH 07/34] fix(supabase): fix lint errors and add supabase test dependency --- integrations/supabase/pyproject.toml | 1 + .../retrievers/supabase/__init__.py | 6 +- .../retrievers/supabase/groonga_retriever.py | 60 ++++++------- .../document_stores/supabase/__init__.py | 4 +- .../supabase/groonga_document_store.py | 90 +++++++------------ .../tests/test_groonga_document_store.py | 39 +++----- 6 files changed, 81 insertions(+), 119 deletions(-) diff --git a/integrations/supabase/pyproject.toml b/integrations/supabase/pyproject.toml index 02cff19cb6..58099d8723 100644 --- a/integrations/supabase/pyproject.toml +++ b/integrations/supabase/pyproject.toml @@ -58,6 +58,7 @@ dependencies = [ "pytest-rerunfailures", "mypy", "pip", + "supabase", ] [tool.hatch.envs.test.scripts] diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py index 04c678c587..30c347418c 100644 --- a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py @@ -2,11 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 from .embedding_retriever import SupabasePgvectorEmbeddingRetriever -from .keyword_retriever import SupabasePgvectorKeywordRetriever from 
.groonga_retriever import SupabaseGroongaRetriever +from .keyword_retriever import SupabasePgvectorKeywordRetriever __all__ = [ + "SupabaseGroongaRetriever", "SupabasePgvectorEmbeddingRetriever", "SupabasePgvectorKeywordRetriever", - "SupabaseGroongaRetriever", -] \ No newline at end of file +] diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py index 4042661fe3..171f245b5a 100644 --- a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import copy -from typing import Any, Dict, List, Optional +from typing import Any from haystack import component, default_from_dict, default_to_dict from haystack.dataclasses import Document @@ -15,35 +15,35 @@ @component class SupabaseGroongaRetriever: """ - Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. + Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. - This retriever works without embeddings — it searches documents using plain text queries. - It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. + This retriever works without embeddings — it searches documents using plain text queries. + It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. 
- Example usage: + Example usage: -```python - from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore - from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever - from haystack.utils import Secret + ```python + from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever + from haystack.utils import Secret - document_store = SupabaseGroongaDocumentStore( - supabase_url="https://.supabase.co", - supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), - table_name="haystack_fts_documents", - ) + document_store = SupabaseGroongaDocumentStore( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", + ) - retriever = SupabaseGroongaRetriever(document_store=document_store, top_k=10) - result = retriever.run(query="python programming") - print(result["documents"]) -``` + retriever = SupabaseGroongaRetriever(document_store=document_store, top_k=10) + result = retriever.run(query="python programming") + print(result["documents"]) + ``` """ def __init__( self, *, document_store: SupabaseGroongaDocumentStore, - filters: Optional[Dict[str, Any]] = None, + filters: dict[str, Any] | None = None, top_k: int = 10, filter_policy: str | FilterPolicy = FilterPolicy.REPLACE, ) -> None: @@ -64,18 +64,16 @@ def __init__( self.filters = filters or {} self.top_k = top_k self.filter_policy = ( - filter_policy - if isinstance(filter_policy, FilterPolicy) - else FilterPolicy.from_str(filter_policy) + filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy) ) - @component.output_types(documents=List[Document]) + @component.output_types(documents=list[Document]) def run( self, query: str, - filters: Optional[Dict[str, Any]] = None, - top_k: Optional[int] = None, - ) -> Dict[str, 
List[Document]]: + filters: dict[str, Any] | None = None, + top_k: int | None = None, + ) -> dict[str, list[Document]]: """ Runs the retriever on the given query. @@ -106,7 +104,7 @@ def run( return {"documents": documents} - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> dict[str, Any]: """ Serializes the component to a dictionary. @@ -121,7 +119,7 @@ def to_dict(self) -> Dict[str, Any]: ) @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "SupabaseGroongaRetriever": + def from_dict(cls, data: dict[str, Any]) -> "SupabaseGroongaRetriever": """ Deserializes the component from a dictionary. @@ -130,9 +128,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "SupabaseGroongaRetriever": """ data = copy.deepcopy(data) doc_store_params = data["init_parameters"]["document_store"] - data["init_parameters"]["document_store"] = SupabaseGroongaDocumentStore.from_dict( - doc_store_params - ) + data["init_parameters"]["document_store"] = SupabaseGroongaDocumentStore.from_dict(doc_store_params) if filter_policy := data["init_parameters"].get("filter_policy"): data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(filter_policy) - return default_from_dict(cls, data) \ No newline at end of file + return default_from_dict(cls, data) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py index d989957a91..06d697a980 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py @@ -5,6 +5,6 @@ from .groonga_document_store import SupabaseGroongaDocumentStore __all__ = [ - "SupabasePgvectorDocumentStore", "SupabaseGroongaDocumentStore", -] \ No newline at end of file + "SupabasePgvectorDocumentStore", +] diff --git 
a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index 4e7ca24d39..cd425f95e0 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any from haystack import default_from_dict, default_to_dict, logging from haystack.dataclasses import Document @@ -17,28 +17,28 @@ class SupabaseGroongaDocumentStore: """ - A Document Store for Supabase using PGroonga for full-text search. + A Document Store for Supabase using PGroonga for full-text search. - PGroonga is a PostgreSQL extension for fast, multilingual full-text search. - Unlike vector search, this store works with plain text queries — no embeddings needed. + PGroonga is a PostgreSQL extension for fast, multilingual full-text search. + Unlike vector search, this store works with plain text queries — no embeddings needed. - Prerequisites: - - A Supabase project with PGroonga extension enabled. - - Enable PGroonga in your Supabase project by running: - `CREATE EXTENSION IF NOT EXISTS pgroonga;` + Prerequisites: + - A Supabase project with PGroonga extension enabled. 
+ - Enable PGroonga in your Supabase project by running: + `CREATE EXTENSION IF NOT EXISTS pgroonga;` - Example usage: + Example usage: -```python - from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore - from haystack.utils import Secret + ```python + from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + from haystack.utils import Secret - document_store = SupabaseGroongaDocumentStore( - supabase_url="https://.supabase.co", - supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), - table_name="haystack_fts_documents", - ) -``` + document_store = SupabaseGroongaDocumentStore( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", + ) + ``` """ def __init__( @@ -76,13 +76,11 @@ def __init__( def _setup_table(self) -> None: """ Creates the documents table with PGroonga index if it does not exist. + If recreate_table is True, drops and recreates the table. """ if self.recreate_table: - self._client.rpc( - "exec_sql", - {"query": f"DROP TABLE IF EXISTS {self.table_name};"} - ).execute() + self._client.rpc("exec_sql", {"query": f"DROP TABLE IF EXISTS {self.table_name};"}).execute() # Create table if not exists create_table_sql = f""" @@ -112,7 +110,7 @@ def count_documents(self) -> int: result = self._client.table(self.table_name).select("id", count="exact").execute() return result.count or 0 - def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: + def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]: # noqa: ARG002 """ Returns documents matching the given filters. 
@@ -125,7 +123,7 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc def write_documents( self, - documents: List[Document], + documents: list[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE, ) -> int: """ @@ -150,26 +148,15 @@ def write_documents( self._client.table(self.table_name).upsert(row).execute() written += 1 elif policy == DuplicatePolicy.SKIP: - existing = ( - self._client.table(self.table_name) - .select("id") - .eq("id", doc.id) - .execute() - ) + existing = self._client.table(self.table_name).select("id").eq("id", doc.id).execute() if not existing.data: self._client.table(self.table_name).insert(row).execute() written += 1 elif policy == DuplicatePolicy.FAIL: - existing = ( - self._client.table(self.table_name) - .select("id") - .eq("id", doc.id) - .execute() - ) + existing = self._client.table(self.table_name).select("id").eq("id", doc.id).execute() if existing.data: - raise DuplicateDocumentError( - f"Document with id {doc.id!r} already exists." - ) + msg = f"Document with id {doc.id!r} already exists." + raise DuplicateDocumentError(msg) self._client.table(self.table_name).insert(row).execute() written += 1 else: @@ -178,7 +165,7 @@ def write_documents( return written - def delete_documents(self, document_ids: List[str]) -> None: + def delete_documents(self, document_ids: list[str]) -> None: """ Deletes documents with the given IDs. @@ -192,8 +179,8 @@ def _groonga_retrieval( self, query: str, top_k: int = 10, - filters: Optional[Dict[str, Any]] = None, - ) -> List[Document]: + filters: dict[str, Any] | None = None, # noqa: ARG002 + ) -> list[Document]: """ Searches documents using PGroonga full-text search. @@ -202,22 +189,13 @@ def _groonga_retrieval( :param filters: Optional filters to apply. :returns: List of matching Document objects ranked by relevance. 
""" - search_sql = f""" - SELECT id, content, meta, - pgroonga_score(tableoid, ctid) AS score - FROM {self.table_name} - WHERE content &@~ %s - ORDER BY score DESC - LIMIT %s; - """ result = self._client.rpc( - "groonga_search", - {"query_text": query, "table": self.table_name, "top_k": top_k} + "groonga_search", {"query_text": query, "table": self.table_name, "top_k": top_k} ).execute() return [self._to_haystack_document(row) for row in result.data] - def _to_haystack_document(self, row: Dict[str, Any]) -> Document: + def _to_haystack_document(self, row: dict[str, Any]) -> Document: """ Converts a database row dictionary into a Haystack Document. @@ -231,7 +209,7 @@ def _to_haystack_document(self, row: Dict[str, Any]) -> Document: score=row.get("score"), ) - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> dict[str, Any]: """ Serializes the component to a dictionary. @@ -246,7 +224,7 @@ def to_dict(self) -> Dict[str, Any]: ) @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "SupabaseGroongaDocumentStore": + def from_dict(cls, data: dict[str, Any]) -> "SupabaseGroongaDocumentStore": """ Deserializes the component from a dictionary. @@ -254,4 +232,4 @@ def from_dict(cls, data: Dict[str, Any]) -> "SupabaseGroongaDocumentStore": :returns: Deserialized component. 
""" deserialize_secrets_inplace(data["init_parameters"], ["supabase_key"]) - return default_from_dict(cls, data) \ No newline at end of file + return default_from_dict(cls, data) diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py index ba9f92232f..bc8273b389 100644 --- a/integrations/supabase/tests/test_groonga_document_store.py +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -3,27 +3,23 @@ # SPDX-License-Identifier: Apache-2.0 from unittest.mock import MagicMock, patch -from typing import Any, Dict import pytest from haystack.dataclasses import Document from haystack.document_stores.types import DuplicatePolicy -from haystack.utils.auth import Secret -from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever - +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore # ───────────────────────────────────────────── # FIXTURES # ───────────────────────────────────────────── + @pytest.fixture def mock_supabase_client(): """Creates a mock Supabase client so we never hit a real database.""" - with patch( - "haystack_integrations.document_stores.supabase.groonga_document_store.create_client" - ) as mock_create: + with patch("haystack_integrations.document_stores.supabase.groonga_document_store.create_client") as mock_create: mock_client = MagicMock() mock_create.return_value = mock_client @@ -60,6 +56,7 @@ def groonga_store(mock_supabase_client, monkeypatch): # noqa: ARG001 # DOCUMENT STORE TESTS # ───────────────────────────────────────────── + def test_init_defaults(mock_supabase_client, monkeypatch): # noqa: ARG001 """Test that default parameters are set correctly.""" monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") @@ -91,18 +88,14 @@ def test_init_invalid_store(): def test_count_documents(groonga_store, 
mock_supabase_client): """Test count_documents returns correct number.""" - mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock( - count=5 - ) + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock(count=5) count = groonga_store.count_documents() assert count == 5 def test_count_documents_empty(groonga_store, mock_supabase_client): """Test count_documents returns 0 when store is empty.""" - mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock( - count=0 - ) + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock(count=0) count = groonga_store.count_documents() assert count == 0 @@ -170,12 +163,12 @@ def test_filter_documents(groonga_store, mock_supabase_client): # SERIALIZATION TESTS # ───────────────────────────────────────────── + def test_to_dict(groonga_store): """Test that to_dict returns correct dictionary.""" result = groonga_store.to_dict() assert result["type"] == ( - "haystack_integrations.document_stores.supabase" - ".groonga_document_store.SupabaseGroongaDocumentStore" + "haystack_integrations.document_stores.supabase.groonga_document_store.SupabaseGroongaDocumentStore" ) assert result["init_parameters"]["table_name"] == "test_groonga_documents" assert result["init_parameters"]["supabase_url"] == "https://fake-project.supabase.co" @@ -186,10 +179,7 @@ def test_from_dict(mock_supabase_client, monkeypatch): # noqa: ARG001 """Test that from_dict recreates the store correctly.""" monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") data = { - "type": ( - "haystack_integrations.document_stores.supabase" - ".groonga_document_store.SupabaseGroongaDocumentStore" - ), + "type": ("haystack_integrations.document_stores.supabase.groonga_document_store.SupabaseGroongaDocumentStore"), "init_parameters": { "supabase_url": "https://fake-project.supabase.co", "supabase_key": { @@ -210,6 +200,7 @@ def 
test_from_dict(mock_supabase_client, monkeypatch): # noqa: ARG001 # RETRIEVER TESTS # ───────────────────────────────────────────── + def test_retriever_init(groonga_store): """Test that retriever initializes correctly.""" retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) @@ -254,18 +245,14 @@ def test_retriever_from_dict(mock_supabase_client, monkeypatch): # noqa: ARG001 """Test that retriever deserializes correctly.""" monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") data = { - "type": ( - "haystack_integrations.components.retrievers.supabase" - ".groonga_retriever.SupabaseGroongaRetriever" - ), + "type": ("haystack_integrations.components.retrievers.supabase.groonga_retriever.SupabaseGroongaRetriever"), "init_parameters": { "top_k": 7, "filters": {}, "filter_policy": "replace", "document_store": { "type": ( - "haystack_integrations.document_stores.supabase" - ".groonga_document_store.SupabaseGroongaDocumentStore" + "haystack_integrations.document_stores.supabase.groonga_document_store.SupabaseGroongaDocumentStore" ), "init_parameters": { "supabase_url": "https://fake-project.supabase.co", @@ -281,4 +268,4 @@ def test_retriever_from_dict(mock_supabase_client, monkeypatch): # noqa: ARG001 }, } retriever = SupabaseGroongaRetriever.from_dict(data) - assert retriever.top_k == 7 \ No newline at end of file + assert retriever.top_k == 7 From e8f53bb181f74e8a7e981d990fbc621cd34ddcb9 Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Mon, 11 May 2026 22:40:15 +0530 Subject: [PATCH 08/34] fix(supabase): fix mypy type errors in groonga document store --- .../supabase/groonga_document_store.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index cd425f95e0..be5e219a57 100644 --- 
a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -10,7 +10,7 @@ from haystack.document_stores.types import DuplicatePolicy from haystack.utils.auth import Secret, deserialize_secrets_inplace -from supabase import Client, create_client +from supabase import create_client logger = logging.getLogger(__name__) @@ -67,8 +67,8 @@ def __init__( self.recreate_table = recreate_table # Connect to Supabase - resolved_key = supabase_key.resolve_value() - self._client: Client = create_client(supabase_url, resolved_key) + key = self.supabase_key.resolve_value() or "" + self._client = create_client(self.supabase_url, key) # Set up the table self._setup_table() @@ -107,8 +107,8 @@ def count_documents(self) -> int: :returns: Number of documents. """ - result = self._client.table(self.table_name).select("id", count="exact").execute() - return result.count or 0 + result = self._client.table(self.table_name).select("*").execute() + return len(result.data) def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]: # noqa: ARG002 """ @@ -119,7 +119,7 @@ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Docume """ query = self._client.table(self.table_name).select("*") result = query.execute() - return [self._to_haystack_document(row) for row in result.data] + return [self._to_haystack_document(row) for row in result.data if isinstance(row, dict)] def write_documents( self, @@ -193,7 +193,7 @@ def _groonga_retrieval( "groonga_search", {"query_text": query, "table": self.table_name, "top_k": top_k} ).execute() - return [self._to_haystack_document(row) for row in result.data] + return [self._to_haystack_document(row) for row in result.data if isinstance(row, dict)] def _to_haystack_document(self, row: dict[str, Any]) -> Document: """ From 446088cf9812ee759d991758e0c9fb7e972d721c Mon 
Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Mon, 11 May 2026 22:52:54 +0530 Subject: [PATCH 09/34] fix(supabase): fix mypy union-attr error and count_documents implementation --- .../document_stores/supabase/groonga_document_store.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index be5e219a57..db413c74b3 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -103,12 +103,12 @@ def _setup_table(self) -> None: def count_documents(self) -> int: """ - Returns the number of documents in the store. + the number of documents in the store. :returns: Number of documents. """ - result = self._client.table(self.table_name).select("*").execute() - return len(result.data) + result = self._client.table(self.table_name).select("*", count="exact").execute() + return int(result.count) if result.count is not None else 0 def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]: # noqa: ARG002 """ @@ -193,7 +193,7 @@ def _groonga_retrieval( "groonga_search", {"query_text": query, "table": self.table_name, "top_k": top_k} ).execute() - return [self._to_haystack_document(row) for row in result.data if isinstance(row, dict)] + return [self._to_haystack_document(row) for row in (result.data or []) if isinstance(row, dict)] def _to_haystack_document(self, row: dict[str, Any]) -> Document: """ From 3ae853c505c76691eae0a81aed0108bdd2a9f9ac Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Tue, 19 May 2026 21:19:11 +0530 Subject: [PATCH 10/34] fix(supabase): address reviewer feedback - lazy init, DocumentStore base class, filters, async --- 
.../retrievers/supabase/groonga_retriever.py | 85 ++++++--- .../supabase/groonga_document_store.py | 168 ++++++++++++++---- .../tests/test_groonga_document_store.py | 96 +++++++++- 3 files changed, 283 insertions(+), 66 deletions(-) diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py index 171f245b5a..ccf4da6976 100644 --- a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py @@ -15,27 +15,31 @@ @component class SupabaseGroongaRetriever: """ - Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. + Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. - This retriever works without embeddings — it searches documents using plain text queries. - It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. + This retriever works without embeddings — it searches documents using plain text queries. + It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. - Example usage: + Note: async operations are not supported as the supabase-py sync client does not expose + awaitable query methods. Use the sync run() method instead. 
- ```python - from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore - from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever - from haystack.utils import Secret - - document_store = SupabaseGroongaDocumentStore( - supabase_url="https://.supabase.co", - supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), - table_name="haystack_fts_documents", - ) + Example usage: - retriever = SupabaseGroongaRetriever(document_store=document_store, top_k=10) - result = retriever.run(query="python programming") - print(result["documents"]) + ```python + from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever + from haystack.utils import Secret + + document_store = SupabaseGroongaDocumentStore( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", + ) + document_store.warm_up() + + retriever = SupabaseGroongaRetriever(document_store=document_store, top_k=10) + result = retriever.run(query="python programming") + print(result["documents"]) ``` """ @@ -85,15 +89,7 @@ def run( if not query: return {"documents": []} - # Handle filter policy - if filters is not None: - if self.filter_policy == FilterPolicy.MERGE: - merged_filters = {**self.filters, **filters} - else: - merged_filters = filters - else: - merged_filters = self.filters - + merged_filters = self._merge_filters(filters) effective_top_k = top_k if top_k is not None else self.top_k documents = self.document_store._groonga_retrieval( @@ -104,6 +100,41 @@ def run( return {"documents": documents} + @component.output_types(documents=list[Document]) + async def run_async( + self, + query: str, + filters: dict[str, Any] | None = None, + top_k: int | None = None, + ) -> dict[str, list[Document]]: + """ + Async version of run(). 
+ + Note: supabase-py's sync client does not support native async queries. + This method runs the synchronous retrieval and returns the result. + For fully async support, consider using acreate_client() from supabase-py + and refactoring the document store accordingly. + + :param query: The text query to search for. + :param filters: Optional runtime filters. Merged or replaced based on filter_policy. + :param top_k: Optional override for maximum number of documents to return. + :returns: Dictionary with key "documents" containing list of matching Documents. + """ + return self.run(query=query, filters=filters, top_k=top_k) + + def _merge_filters(self, runtime_filters: dict[str, Any] | None) -> dict[str, Any]: + """ + Merges runtime filters with init filters based on filter_policy. + + :param runtime_filters: Filters passed at runtime. + :returns: Merged filters dictionary. + """ + if runtime_filters is not None: + if self.filter_policy == FilterPolicy.MERGE: + return {**self.filters, **runtime_filters} + return runtime_filters + return self.filters + def to_dict(self) -> dict[str, Any]: """ Serializes the component to a dictionary. 
@@ -131,4 +162,4 @@ def from_dict(cls, data: dict[str, Any]) -> "SupabaseGroongaRetriever": data["init_parameters"]["document_store"] = SupabaseGroongaDocumentStore.from_dict(doc_store_params) if filter_policy := data["init_parameters"].get("filter_policy"): data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(filter_policy) - return default_from_dict(cls, data) + return default_from_dict(cls, data) \ No newline at end of file diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index db413c74b3..cadd2a5b75 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -2,42 +2,43 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any +from typing import Any, Optional from haystack import default_from_dict, default_to_dict, logging from haystack.dataclasses import Document from haystack.document_stores.errors import DuplicateDocumentError -from haystack.document_stores.types import DuplicatePolicy +from haystack.document_stores.types import DocumentStore, DuplicatePolicy from haystack.utils.auth import Secret, deserialize_secrets_inplace -from supabase import create_client +from supabase import create_client, Client logger = logging.getLogger(__name__) -class SupabaseGroongaDocumentStore: +class SupabaseGroongaDocumentStore(DocumentStore): """ - A Document Store for Supabase using PGroonga for full-text search. + A Document Store for Supabase using PGroonga for full-text search. - PGroonga is a PostgreSQL extension for fast, multilingual full-text search. - Unlike vector search, this store works with plain text queries — no embeddings needed. + PGroonga is a PostgreSQL extension for fast, multilingual full-text search. 
+ Unlike vector search, this store works with plain text queries — no embeddings needed. - Prerequisites: - - A Supabase project with PGroonga extension enabled. - - Enable PGroonga in your Supabase project by running: - `CREATE EXTENSION IF NOT EXISTS pgroonga;` + Prerequisites: + - A Supabase project with PGroonga extension enabled. + - Enable PGroonga in your Supabase project by running: + `CREATE EXTENSION IF NOT EXISTS pgroonga;` - Example usage: + Example usage: ```python - from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore - from haystack.utils import Secret - - document_store = SupabaseGroongaDocumentStore( - supabase_url="https://.supabase.co", - supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), - table_name="haystack_fts_documents", - ) + from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + from haystack.utils import Secret + + document_store = SupabaseGroongaDocumentStore( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", + ) + document_store.warm_up() ``` """ @@ -52,6 +53,8 @@ def __init__( """ Creates a new SupabaseGroongaDocumentStore instance. + Note: Call warm_up() before using the store to initialize the client and table. + :param supabase_url: The URL of your Supabase project. Format: `https://.supabase.co` :param supabase_key: The service role key for your Supabase project. @@ -66,11 +69,17 @@ def __init__( self.table_name = table_name self.recreate_table = recreate_table - # Connect to Supabase + # Client is initialized lazily in warm_up() + self._client: Optional[Client] = None + + def warm_up(self) -> None: + """ + Initializes the Supabase client and sets up the table. + + Must be called before using the document store. 
+ """ key = self.supabase_key.resolve_value() or "" self._client = create_client(self.supabase_url, key) - - # Set up the table self._setup_table() def _setup_table(self) -> None: @@ -79,6 +88,8 @@ def _setup_table(self) -> None: If recreate_table is True, drops and recreates the table. """ + assert self._client is not None, "Call warm_up() before using the document store." + if self.recreate_table: self._client.rpc("exec_sql", {"query": f"DROP TABLE IF EXISTS {self.table_name};"}).execute() @@ -103,36 +114,80 @@ def _setup_table(self) -> None: def count_documents(self) -> int: """ - the number of documents in the store. + Returns the number of documents in the store. :returns: Number of documents. """ + assert self._client is not None, "Call warm_up() before using the document store." result = self._client.table(self.table_name).select("*", count="exact").execute() return int(result.count) if result.count is not None else 0 - def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]: # noqa: ARG002 + def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]: """ Returns documents matching the given filters. - :param filters: Optional dictionary of filters. + Supported filters: equality filters on `id`, `content`, and `meta` fields. + + :param filters: Optional dictionary of filters. Example: {"field": "meta.language", "operator": "==", "value": "en"} :returns: List of matching Document objects. """ + assert self._client is not None, "Call warm_up() before using the document store." + query = self._client.table(self.table_name).select("*") + + if filters: + query = self._apply_filters(query, filters) + result = query.execute() return [self._to_haystack_document(row) for row in result.data if isinstance(row, dict)] + def _apply_filters(self, query: Any, filters: dict[str, Any]) -> Any: + """ + Applies filters to a Supabase query. + + :param query: The Supabase query builder. 
+ :param filters: Dictionary of filters to apply. + :returns: The query with filters applied. + """ + operator = filters.get("operator", "AND") + conditions = filters.get("conditions", []) + + for condition in conditions: + field = condition.get("field", "") + op = condition.get("operator", "==") + value = condition.get("value") + + # Handle nested meta fields e.g. "meta.language" + if field.startswith("meta."): + meta_key = field[len("meta."):] + if op == "==": + query = query.eq(f"meta->>'{meta_key}'", value) + elif op == "!=": + query = query.neq(f"meta->>'{meta_key}'", value) + else: + if op == "==": + query = query.eq(field, value) + elif op == "!=": + query = query.neq(field, value) + elif op == "in": + query = query.in_(field, value) + + return query + def write_documents( self, documents: list[Document], - policy: DuplicatePolicy = DuplicatePolicy.NONE, + policy: DuplicatePolicy = DuplicatePolicy.FAIL, ) -> int: """ Writes documents to the store. :param documents: List of Haystack Document objects to write. - :param policy: How to handle duplicate documents. + :param policy: How to handle duplicate documents. Defaults to DuplicatePolicy.FAIL. :returns: Number of documents written. """ + assert self._client is not None, "Call warm_up() before using the document store." + if not documents: return 0 @@ -171,6 +226,8 @@ def delete_documents(self, document_ids: list[str]) -> None: :param document_ids: List of document IDs to delete. """ + assert self._client is not None, "Call warm_up() before using the document store." + if not document_ids: return self._client.table(self.table_name).delete().in_("id", document_ids).execute() @@ -179,21 +236,68 @@ def _groonga_retrieval( self, query: str, top_k: int = 10, - filters: dict[str, Any] | None = None, # noqa: ARG002 + filters: dict[str, Any] | None = None, ) -> list[Document]: """ Searches documents using PGroonga full-text search. :param query: The text query to search for. 
:param top_k: Maximum number of results to return. - :param filters: Optional filters to apply. + :param filters: Optional filters to apply after retrieval. :returns: List of matching Document objects ranked by relevance. """ + assert self._client is not None, "Call warm_up() before using the document store." + result = self._client.rpc( "groonga_search", {"query_text": query, "table": self.table_name, "top_k": top_k} ).execute() - return [self._to_haystack_document(row) for row in (result.data or []) if isinstance(row, dict)] + documents = [self._to_haystack_document(row) for row in (result.data or []) if isinstance(row, dict)] + + # Apply filters post-retrieval if provided + if filters: + documents = self._filter_documents_in_memory(documents, filters) + + return documents + + def _filter_documents_in_memory(self, documents: list[Document], filters: dict[str, Any]) -> list[Document]: + """ + Filters a list of documents in memory based on the given filters. + + :param documents: List of documents to filter. + :param filters: Dictionary of filters to apply. + :returns: Filtered list of documents. + """ + conditions = filters.get("conditions", []) + filtered = [] + + for doc in documents: + match = True + for condition in conditions: + field = condition.get("field", "") + op = condition.get("operator", "==") + value = condition.get("value") + + if field.startswith("meta."): + meta_key = field[len("meta."):] + doc_value = doc.meta.get(meta_key) + else: + doc_value = getattr(doc, field, None) + + if op == "==" and doc_value != value: + match = False + break + elif op == "!=" and doc_value == value: + match = False + break + elif op == "in" and doc_value not in value: + match = False + break + + if match: + filtered.append(doc) + + return filtered def _to_haystack_document(self, row: dict[str, Any]) -> Document: """ @@ -232,4 +336,4 @@ def from_dict(cls, data: dict[str, Any]) -> "SupabaseGroongaDocumentStore": :returns: Deserialized component. 
""" deserialize_secrets_inplace(data["init_parameters"], ["supabase_key"]) - return default_from_dict(cls, data) + return default_from_dict(cls, data) \ No newline at end of file diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py index bc8273b389..f652344dac 100644 --- a/integrations/supabase/tests/test_groonga_document_store.py +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -42,13 +42,14 @@ def mock_supabase_client(): @pytest.fixture def groonga_store(mock_supabase_client, monkeypatch): # noqa: ARG001 - """Creates a SupabaseGroongaDocumentStore with mocked client.""" + """Creates a SupabaseGroongaDocumentStore with mocked client and calls warm_up().""" monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") store = SupabaseGroongaDocumentStore( supabase_url="https://fake-project.supabase.co", table_name="test_groonga_documents", recreate_table=False, ) + store.warm_up() # must call warm_up() before using the store return store @@ -57,8 +58,8 @@ def groonga_store(mock_supabase_client, monkeypatch): # noqa: ARG001 # ───────────────────────────────────────────── -def test_init_defaults(mock_supabase_client, monkeypatch): # noqa: ARG001 - """Test that default parameters are set correctly.""" +def test_init_defaults(monkeypatch): + """Test that default parameters are set correctly without connecting.""" monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") store = SupabaseGroongaDocumentStore( supabase_url="https://fake-project.supabase.co", @@ -66,10 +67,12 @@ def test_init_defaults(mock_supabase_client, monkeypatch): # noqa: ARG001 assert store.table_name == "haystack_groonga_documents" assert store.recreate_table is False assert store.supabase_url == "https://fake-project.supabase.co" + # client should be None before warm_up() + assert store._client is None -def test_init_custom_params(mock_supabase_client, monkeypatch): # noqa: ARG001 - """Test that 
custom parameters are set correctly.""" +def test_init_custom_params(monkeypatch): + """Test that custom parameters are set correctly without connecting.""" monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") store = SupabaseGroongaDocumentStore( supabase_url="https://fake-project.supabase.co", @@ -78,6 +81,19 @@ def test_init_custom_params(mock_supabase_client, monkeypatch): # noqa: ARG001 ) assert store.table_name == "my_custom_table" assert store.recreate_table is True + # client should be None before warm_up() + assert store._client is None + + +def test_warm_up_initializes_client(mock_supabase_client, monkeypatch): # noqa: ARG001 + """Test that warm_up() initializes the client.""" + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + ) + assert store._client is None + store.warm_up() + assert store._client is not None def test_init_invalid_store(): @@ -103,13 +119,16 @@ def test_count_documents_empty(groonga_store, mock_supabase_client): def test_write_documents(groonga_store, mock_supabase_client): """Test that write_documents writes correct number of documents.""" mock_table = mock_supabase_client.table.return_value + # mock select chain so no existing docs are found + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[]) mock_table.insert.return_value.execute.return_value = MagicMock(data=[{}]) documents = [ Document(content="Python is great"), Document(content="Haystack is a RAG framework"), ] - written = groonga_store.write_documents(documents) + # use OVERWRITE to avoid the duplicate check entirely + written = groonga_store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE) assert written == 2 @@ -130,6 +149,30 @@ def test_write_documents_overwrite(groonga_store, mock_supabase_client): mock_table.upsert.assert_called_once() +def test_write_documents_skip(groonga_store, mock_supabase_client): + """Test 
that skip policy skips existing documents.""" + mock_table = mock_supabase_client.table.return_value + # simulate document already exists + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[{"id": "existing"}]) + + documents = [Document(content="already exists")] + written = groonga_store.write_documents(documents, policy=DuplicatePolicy.SKIP) + assert written == 0 + + +def test_write_documents_fail_on_duplicate(groonga_store, mock_supabase_client): + """Test that FAIL policy raises error on duplicate.""" + from haystack.document_stores.errors import DuplicateDocumentError + + mock_table = mock_supabase_client.table.return_value + # simulate document already exists + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[{"id": "existing"}]) + + doc = Document(content="duplicate doc") + with pytest.raises(DuplicateDocumentError): + groonga_store.write_documents([doc], policy=DuplicatePolicy.FAIL) + + def test_delete_documents(groonga_store, mock_supabase_client): """Test that delete_documents calls delete with correct IDs.""" mock_table = mock_supabase_client.table.return_value @@ -159,6 +202,19 @@ def test_filter_documents(groonga_store, mock_supabase_client): assert docs[1].content == "Haystack rocks" +def test_filter_documents_with_filters(groonga_store, mock_supabase_client): + """Test that filter_documents applies filters correctly.""" + mock_table = mock_supabase_client.table.return_value + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock( + data=[ + {"id": "1", "content": "Python is great", "meta": {"language": "en"}, "score": None}, + ] + ) + filters = {"conditions": [{"field": "meta.language", "operator": "==", "value": "en"}]} + docs = groonga_store.filter_documents(filters=filters) + assert len(docs) == 1 + + # ───────────────────────────────────────────── # SERIALIZATION TESTS # ───────────────────────────────────────────── @@ -194,6 +250,8 @@ def 
test_from_dict(mock_supabase_client, monkeypatch): # noqa: ARG001 store = SupabaseGroongaDocumentStore.from_dict(data) assert store.table_name == "test_groonga_documents" assert store.supabase_url == "https://fake-project.supabase.co" + # client should be None — warm_up() not called yet + assert store._client is None # ───────────────────────────────────────────── @@ -231,6 +289,30 @@ def test_retriever_run(groonga_store, mock_supabase_client): retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) result = retriever.run(query="Python") assert "documents" in result + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "Python is great" + + +@pytest.mark.asyncio +async def test_retriever_run_async(groonga_store, mock_supabase_client): + """Test that async retriever run returns same result as sync run.""" + mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( + data=[ + {"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}, + ] + ) + retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + result = await retriever.run_async(query="Python") + assert "documents" in result + assert len(result["documents"]) == 1 + + +@pytest.mark.asyncio +async def test_retriever_run_async_empty_query(groonga_store): + """Test that empty query in async run returns empty documents.""" + retriever = SupabaseGroongaRetriever(document_store=groonga_store) + result = await retriever.run_async(query="") + assert result == {"documents": []} def test_retriever_to_dict(groonga_store): @@ -268,4 +350,4 @@ def test_retriever_from_dict(mock_supabase_client, monkeypatch): # noqa: ARG001 }, } retriever = SupabaseGroongaRetriever.from_dict(data) - assert retriever.top_k == 7 + assert retriever.top_k == 7 \ No newline at end of file From 5be352e95ea7fce3137512ef349981e0b19ae98d Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Tue, 19 May 2026 21:44:08 +0530 Subject: [PATCH 11/34] fix(supabase): fix 
lint errors - imports, assert, formatting --- .../retrievers/supabase/groonga_retriever.py | 4 +- .../supabase/groonga_document_store.py | 46 ++++++++++++------- .../tests/test_groonga_document_store.py | 5 +- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py index ccf4da6976..e71a059f03 100644 --- a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py @@ -108,7 +108,7 @@ async def run_async( top_k: int | None = None, ) -> dict[str, list[Document]]: """ - Async version of run(). + Async version of run(). Note: supabase-py's sync client does not support native async queries. This method runs the synchronous retrieval and returns the result. 
@@ -162,4 +162,4 @@ def from_dict(cls, data: dict[str, Any]) -> "SupabaseGroongaRetriever": data["init_parameters"]["document_store"] = SupabaseGroongaDocumentStore.from_dict(doc_store_params) if filter_policy := data["init_parameters"].get("filter_policy"): data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(filter_policy) - return default_from_dict(cls, data) \ No newline at end of file + return default_from_dict(cls, data) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index cadd2a5b75..522a21b406 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional +from typing import Any from haystack import default_from_dict, default_to_dict, logging from haystack.dataclasses import Document @@ -10,7 +10,7 @@ from haystack.document_stores.types import DocumentStore, DuplicatePolicy from haystack.utils.auth import Secret, deserialize_secrets_inplace -from supabase import create_client, Client +from supabase import Client, create_client logger = logging.getLogger(__name__) @@ -70,7 +70,7 @@ def __init__( self.recreate_table = recreate_table # Client is initialized lazily in warm_up() - self._client: Optional[Client] = None + self._client: Client | None = None def warm_up(self) -> None: """ @@ -88,7 +88,9 @@ def _setup_table(self) -> None: If recreate_table is True, drops and recreates the table. """ - assert self._client is not None, "Call warm_up() before using the document store." + if self._client is None: + msg = "Call warm_up() before using the document store." 
+ raise RuntimeError(msg) if self.recreate_table: self._client.rpc("exec_sql", {"query": f"DROP TABLE IF EXISTS {self.table_name};"}).execute() @@ -118,7 +120,9 @@ def count_documents(self) -> int: :returns: Number of documents. """ - assert self._client is not None, "Call warm_up() before using the document store." + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) result = self._client.table(self.table_name).select("*", count="exact").execute() return int(result.count) if result.count is not None else 0 @@ -128,10 +132,13 @@ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Docume Supported filters: equality filters on `id`, `content`, and `meta` fields. - :param filters: Optional dictionary of filters. Example: {"field": "meta.language", "operator": "==", "value": "en"} + :param filters: Optional dictionary of filters. + Example: ``{"field": "meta.language", "operator": "==", "value": "en"}`` :returns: List of matching Document objects. """ - assert self._client is not None, "Call warm_up() before using the document store." + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) query = self._client.table(self.table_name).select("*") @@ -149,7 +156,6 @@ def _apply_filters(self, query: Any, filters: dict[str, Any]) -> Any: :param filters: Dictionary of filters to apply. :returns: The query with filters applied. """ - operator = filters.get("operator", "AND") conditions = filters.get("conditions", []) for condition in conditions: @@ -159,13 +165,12 @@ def _apply_filters(self, query: Any, filters: dict[str, Any]) -> Any: # Handle nested meta fields e.g. 
"meta.language" if field.startswith("meta."): - meta_key = field[len("meta."):] + meta_key = field[len("meta.") :] if op == "==": query = query.eq(f"meta->>'{meta_key}'", value) elif op == "!=": query = query.neq(f"meta->>'{meta_key}'", value) - else: - if op == "==": + elif op == "==": query = query.eq(field, value) elif op == "!=": query = query.neq(field, value) @@ -186,7 +191,9 @@ def write_documents( :param policy: How to handle duplicate documents. Defaults to DuplicatePolicy.FAIL. :returns: Number of documents written. """ - assert self._client is not None, "Call warm_up() before using the document store." + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) if not documents: return 0 @@ -226,7 +233,9 @@ def delete_documents(self, document_ids: list[str]) -> None: :param document_ids: List of document IDs to delete. """ - assert self._client is not None, "Call warm_up() before using the document store." + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) if not document_ids: return @@ -246,10 +255,13 @@ def _groonga_retrieval( :param filters: Optional filters to apply after retrieval. :returns: List of matching Document objects ranked by relevance. """ - assert self._client is not None, "Call warm_up() before using the document store." + if self._client is None: + msg = "Call warm_up() before using the document store." 
+ raise RuntimeError(msg) result = self._client.rpc( - "groonga_search", {"query_text": query, "table": self.table_name, "top_k": top_k} + "groonga_search", + {"query_text": query, "table": self.table_name, "top_k": top_k}, ).execute() documents = [self._to_haystack_document(row) for row in (result.data or []) if isinstance(row, dict)] @@ -279,7 +291,7 @@ def _filter_documents_in_memory(self, documents: list[Document], filters: dict[s value = condition.get("value") if field.startswith("meta."): - meta_key = field[len("meta."):] + meta_key = field[len("meta.") :] doc_value = doc.meta.get(meta_key) else: doc_value = getattr(doc, field, None) @@ -336,4 +348,4 @@ def from_dict(cls, data: dict[str, Any]) -> "SupabaseGroongaDocumentStore": :returns: Deserialized component. """ deserialize_secrets_inplace(data["init_parameters"], ["supabase_key"]) - return default_from_dict(cls, data) \ No newline at end of file + return default_from_dict(cls, data) diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py index f652344dac..db9e1a603f 100644 --- a/integrations/supabase/tests/test_groonga_document_store.py +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -6,6 +6,7 @@ import pytest from haystack.dataclasses import Document +from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever @@ -162,8 +163,6 @@ def test_write_documents_skip(groonga_store, mock_supabase_client): def test_write_documents_fail_on_duplicate(groonga_store, mock_supabase_client): """Test that FAIL policy raises error on duplicate.""" - from haystack.document_stores.errors import DuplicateDocumentError - mock_table = mock_supabase_client.table.return_value # simulate document already exists mock_table.select.return_value.eq.return_value.execute.return_value = 
MagicMock(data=[{"id": "existing"}]) @@ -350,4 +349,4 @@ def test_retriever_from_dict(mock_supabase_client, monkeypatch): # noqa: ARG001 }, } retriever = SupabaseGroongaRetriever.from_dict(data) - assert retriever.top_k == 7 \ No newline at end of file + assert retriever.top_k == 7 From d67997599266825b9e5fdf087c214f5072994808 Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Tue, 19 May 2026 22:01:56 +0530 Subject: [PATCH 12/34] fix(supabase): fix mypy type errors - CountMethod and union-attr --- .../supabase/groonga_document_store.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index 522a21b406..ccf941d6fc 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -9,6 +9,7 @@ from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DocumentStore, DuplicatePolicy from haystack.utils.auth import Secret, deserialize_secrets_inplace +from postgrest import CountMethod from supabase import Client, create_client @@ -123,7 +124,7 @@ def count_documents(self) -> int: if self._client is None: msg = "Call warm_up() before using the document store." 
raise RuntimeError(msg) - result = self._client.table(self.table_name).select("*", count="exact").execute() + result = self._client.table(self.table_name).select("*", count=CountMethod.exact).execute() return int(result.count) if result.count is not None else 0 def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]: @@ -170,12 +171,12 @@ def _apply_filters(self, query: Any, filters: dict[str, Any]) -> Any: query = query.eq(f"meta->>'{meta_key}'", value) elif op == "!=": query = query.neq(f"meta->>'{meta_key}'", value) - elif op == "==": - query = query.eq(field, value) - elif op == "!=": - query = query.neq(field, value) - elif op == "in": - query = query.in_(field, value) + elif op == "==": + query = query.eq(field, value) + elif op == "!=": + query = query.neq(field, value) + elif op == "in": + query = query.in_(field, value) return query @@ -264,7 +265,8 @@ def _groonga_retrieval( {"query_text": query, "table": self.table_name, "top_k": top_k}, ).execute() - documents = [self._to_haystack_document(row) for row in (result.data or []) if isinstance(row, dict)] + data = result.data if isinstance(result.data, list) else [] + documents = [self._to_haystack_document(row) for row in data if isinstance(row, dict)] # Apply filters post-retrieval if provided if filters: From 78231ce8efa05772fc44479713c2d59c4a707fa2 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 20 May 2026 10:55:59 +0200 Subject: [PATCH 13/34] converting methods to static --- .../document_stores/supabase/groonga_document_store.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index ccf941d6fc..a295bdeee2 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -144,12 +144,13 @@ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Docume query = self._client.table(self.table_name).select("*") if filters: - query = self._apply_filters(query, filters) + query = SupabaseGroongaDocumentStore._apply_filters(query, filters) result = query.execute() return [self._to_haystack_document(row) for row in result.data if isinstance(row, dict)] - def _apply_filters(self, query: Any, filters: dict[str, Any]) -> Any: + @staticmethod + def _apply_filters(query: Any, filters: dict[str, Any]) -> Any: """ Applies filters to a Supabase query. @@ -270,11 +271,12 @@ def _groonga_retrieval( # Apply filters post-retrieval if provided if filters: - documents = self._filter_documents_in_memory(documents, filters) + documents = SupabaseGroongaDocumentStore._filter_documents_in_memory(documents, filters) return documents - def _filter_documents_in_memory(self, documents: list[Document], filters: dict[str, Any]) -> list[Document]: + @staticmethod + def _filter_documents_in_memory(documents: list[Document], filters: dict[str, Any]) -> list[Document]: """ Filters a list of documents in memory based on the given filters. 
From 334744f0c6ed9bca870a0a27fd1245e2051e87f0 Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Wed, 20 May 2026 22:16:18 +0530 Subject: [PATCH 14/34] fix(supabase): fix groonga_search parameter name and add integration tests --- .../supabase/groonga_document_store.py | 5 +- .../tests/test_groonga_integration.py | 170 ++++++++++++++++++ 2 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 integrations/supabase/tests/test_groonga_integration.py diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index ccf941d6fc..e8b610d572 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -124,7 +124,10 @@ def count_documents(self) -> int: if self._client is None: msg = "Call warm_up() before using the document store." 
raise RuntimeError(msg) - result = self._client.table(self.table_name).select("*", count=CountMethod.exact).execute() + result = self._client.rpc( + "groonga_search", + {"query_text": query, "table_name": self.table_name, "top_k": top_k}, + ).execute() return int(result.count) if result.count is not None else 0 def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]: diff --git a/integrations/supabase/tests/test_groonga_integration.py b/integrations/supabase/tests/test_groonga_integration.py new file mode 100644 index 0000000000..9b760b5c4c --- /dev/null +++ b/integrations/supabase/tests/test_groonga_integration.py @@ -0,0 +1,170 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import pytest +from haystack.dataclasses import Document +from haystack.document_stores.errors import DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret + +from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + + +@pytest.fixture() +def document_store(): + """ + Creates a real SupabaseGroongaDocumentStore connected to a test Supabase project. + Requires SUPABASE_URL and SUPABASE_SERVICE_KEY environment variables to be set. 
+ """ + store = SupabaseGroongaDocumentStore( + supabase_url=os.environ["SUPABASE_URL"], + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_integration_test", + recreate_table=False, + ) + store.warm_up() + yield store + # Cleanup after each test + all_docs = store.filter_documents() + if all_docs: + store.delete_documents([doc.id for doc in all_docs]) + + +@pytest.mark.integration +class TestSupabaseGroongaDocumentStoreIntegration: + def test_count_empty(self, document_store): + """Test count is zero for an empty document store.""" + assert document_store.count_documents() == 0 + + def test_count_not_empty(self, document_store): + """Test count is correct after writing documents.""" + docs = [ + Document(content="test doc 1"), + Document(content="test doc 2"), + Document(content="test doc 3"), + ] + document_store.write_documents(docs) + assert document_store.count_documents() == 3 + + def test_write_and_filter_documents(self, document_store): + """Test writing and retrieving documents.""" + docs = [ + Document(content="Python is a programming language"), + Document(content="Haystack is a RAG framework"), + ] + written = document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + assert written == 2 + + all_docs = document_store.filter_documents() + assert len(all_docs) == 2 + contents = {doc.content for doc in all_docs} + assert "Python is a programming language" in contents + assert "Haystack is a RAG framework" in contents + + def test_write_documents_duplicate_fail(self, document_store): + """Test write_documents fails with DuplicatePolicy.FAIL on duplicate.""" + doc = Document(content="test doc") + document_store.write_documents([doc], policy=DuplicatePolicy.FAIL) + + with pytest.raises(DuplicateDocumentError): + document_store.write_documents([doc], policy=DuplicatePolicy.FAIL) + + def test_write_documents_duplicate_skip(self, document_store): + """Test write_documents skips duplicate with 
DuplicatePolicy.SKIP.""" + doc = Document(content="test doc") + assert document_store.write_documents([doc], policy=DuplicatePolicy.SKIP) == 1 + assert document_store.write_documents([doc], policy=DuplicatePolicy.SKIP) == 0 + assert document_store.count_documents() == 1 + + def test_write_documents_duplicate_overwrite(self, document_store): + """Test write_documents overwrites with DuplicatePolicy.OVERWRITE.""" + doc1 = Document(id="test-id-1", content="original content") + doc2 = Document(id="test-id-1", content="updated content") + + document_store.write_documents([doc1], policy=DuplicatePolicy.OVERWRITE) + document_store.write_documents([doc2], policy=DuplicatePolicy.OVERWRITE) + + all_docs = document_store.filter_documents() + assert len(all_docs) == 1 + assert all_docs[0].content == "updated content" + + def test_delete_documents(self, document_store): + """Test deleting documents by ID.""" + docs = [ + Document(content="doc to delete"), + Document(content="doc to keep"), + ] + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + assert document_store.count_documents() == 2 + + document_store.delete_documents([docs[0].id]) + assert document_store.count_documents() == 1 + + remaining = document_store.filter_documents() + assert remaining[0].content == "doc to keep" + + def test_delete_documents_empty_list(self, document_store): + """Test deleting with empty list does nothing.""" + doc = Document(content="test doc") + document_store.write_documents([doc], policy=DuplicatePolicy.OVERWRITE) + document_store.delete_documents([]) + assert document_store.count_documents() == 1 + + def test_write_documents_with_meta(self, document_store): + """Test writing documents with metadata.""" + docs = [ + Document(content="Python tutorial", meta={"language": "en", "topic": "programming"}), + Document(content="French cooking", meta={"language": "fr", "topic": "cooking"}), + ] + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + all_docs 
= document_store.filter_documents() + assert len(all_docs) == 2 + meta_map = {doc.content: doc.meta for doc in all_docs} + assert meta_map["Python tutorial"]["language"] == "en" + assert meta_map["French cooking"]["language"] == "fr" + + def test_groonga_retrieval(self, document_store): + """Test full-text search retrieval using PGroonga.""" + docs = [ + Document(content="Python is a great programming language"), + Document(content="Haystack is built for RAG pipelines"), + Document(content="Supabase is a great backend platform"), + ] + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + results = document_store._groonga_retrieval(query="Python", top_k=5) + assert len(results) >= 1 + assert any("Python" in doc.content for doc in results) + + def test_groonga_retrieval_top_k(self, document_store): + """Test that top_k limits the number of results.""" + docs = [Document(content=f"document about python number {i}") for i in range(5)] + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + results = document_store._groonga_retrieval(query="python", top_k=2) + assert len(results) <= 2 + + def test_retriever_run(self, document_store): + """Test the retriever returns documents for a query.""" + docs = [ + Document(content="Python programming is fun"), + Document(content="Java is also popular"), + ] + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + retriever = SupabaseGroongaRetriever(document_store=document_store, top_k=5) + result = retriever.run(query="Python") + + assert "documents" in result + assert len(result["documents"]) >= 1 + assert any("Python" in doc.content for doc in result["documents"]) + + def test_retriever_empty_query(self, document_store): + """Test retriever returns empty list for empty query.""" + retriever = SupabaseGroongaRetriever(document_store=document_store) + result = retriever.run(query="") + assert result == {"documents": []} \ No newline at end of file From 
2b882498811e2b0b3b6aeea735d3710d8426d4cb Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 29 May 2026 15:11:59 +0200 Subject: [PATCH 15/34] using apply_filter_policy from haystack document stores --- .../retrievers/supabase/groonga_retriever.py | 16 ++-------------- .../supabase/groonga_document_store.py | 1 + 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py index e71a059f03..36f4526342 100644 --- a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py @@ -8,6 +8,7 @@ from haystack import component, default_from_dict, default_to_dict from haystack.dataclasses import Document from haystack.document_stores.types import FilterPolicy +from haystack.document_stores.types.filter_policy import apply_filter_policy from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore @@ -89,7 +90,7 @@ def run( if not query: return {"documents": []} - merged_filters = self._merge_filters(filters) + merged_filters = apply_filter_policy(self.filter_policy, self.filters, filters) effective_top_k = top_k if top_k is not None else self.top_k documents = self.document_store._groonga_retrieval( @@ -122,19 +123,6 @@ async def run_async( """ return self.run(query=query, filters=filters, top_k=top_k) - def _merge_filters(self, runtime_filters: dict[str, Any] | None) -> dict[str, Any]: - """ - Merges runtime filters with init filters based on filter_policy. - - :param runtime_filters: Filters passed at runtime. - :returns: Merged filters dictionary. 
- """ - if runtime_filters is not None: - if self.filter_policy == FilterPolicy.MERGE: - return {**self.filters, **runtime_filters} - return runtime_filters - return self.filters - def to_dict(self) -> dict[str, Any]: """ Serializes the component to a dictionary. diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index a295bdeee2..66f99817f5 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import re from typing import Any from haystack import default_from_dict, default_to_dict, logging From 1743aefa24d175ebf842124e3f88294137c34eef Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 29 May 2026 15:12:23 +0200 Subject: [PATCH 16/34] adding table_name validation --- .../document_stores/supabase/groonga_document_store.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index 66f99817f5..cc475a651a 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -66,6 +66,10 @@ def __init__( :param recreate_table: Whether to drop and recreate the table on startup. Defaults to `False`. 
""" + if not re.fullmatch(r"[a-zA-Z_][a-zA-Z0-9_]*", table_name): + msg = f"Invalid table_name {table_name!r}: must match [a-zA-Z_][a-zA-Z0-9_]*" + raise ValueError(msg) + self.supabase_url = supabase_url self.supabase_key = supabase_key self.table_name = table_name From 525b8fea008b598937471ceeb74acb5d41fc674a Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 29 May 2026 15:15:18 +0200 Subject: [PATCH 17/34] updating tests --- .../tests/test_groonga_document_store.py | 505 ++++++++---------- 1 file changed, 223 insertions(+), 282 deletions(-) diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py index db9e1a603f..5d9e6fc33e 100644 --- a/integrations/supabase/tests/test_groonga_document_store.py +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -24,10 +24,8 @@ def mock_supabase_client(): mock_client = MagicMock() mock_create.return_value = mock_client - # Mock rpc calls (used in _setup_table) mock_client.rpc.return_value.execute.return_value = MagicMock(data=[], count=0) - # Mock table calls mock_table = MagicMock() mock_client.table.return_value = mock_table mock_table.select.return_value = mock_table @@ -50,7 +48,7 @@ def groonga_store(mock_supabase_client, monkeypatch): # noqa: ARG001 table_name="test_groonga_documents", recreate_table=False, ) - store.warm_up() # must call warm_up() before using the store + store.warm_up() return store @@ -59,198 +57,156 @@ def groonga_store(mock_supabase_client, monkeypatch): # noqa: ARG001 # ───────────────────────────────────────────── -def test_init_defaults(monkeypatch): - """Test that default parameters are set correctly without connecting.""" - monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") - store = SupabaseGroongaDocumentStore( - supabase_url="https://fake-project.supabase.co", - ) - assert store.table_name == "haystack_groonga_documents" - assert store.recreate_table is False - assert 
store.supabase_url == "https://fake-project.supabase.co" - # client should be None before warm_up() - assert store._client is None - - -def test_init_custom_params(monkeypatch): - """Test that custom parameters are set correctly without connecting.""" - monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") - store = SupabaseGroongaDocumentStore( - supabase_url="https://fake-project.supabase.co", - table_name="my_custom_table", - recreate_table=True, - ) - assert store.table_name == "my_custom_table" - assert store.recreate_table is True - # client should be None before warm_up() - assert store._client is None - - -def test_warm_up_initializes_client(mock_supabase_client, monkeypatch): # noqa: ARG001 - """Test that warm_up() initializes the client.""" - monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") - store = SupabaseGroongaDocumentStore( - supabase_url="https://fake-project.supabase.co", - ) - assert store._client is None - store.warm_up() - assert store._client is not None - - -def test_init_invalid_store(): - """Test that passing wrong store to retriever raises ValueError.""" - with pytest.raises(ValueError, match="document_store must be an instance"): - SupabaseGroongaRetriever(document_store="not_a_store") - - -def test_count_documents(groonga_store, mock_supabase_client): - """Test count_documents returns correct number.""" - mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock(count=5) - count = groonga_store.count_documents() - assert count == 5 - - -def test_count_documents_empty(groonga_store, mock_supabase_client): - """Test count_documents returns 0 when store is empty.""" - mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock(count=0) - count = groonga_store.count_documents() - assert count == 0 - - -def test_write_documents(groonga_store, mock_supabase_client): - """Test that write_documents writes correct number of documents.""" - mock_table = 
mock_supabase_client.table.return_value - # mock select chain so no existing docs are found - mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[]) - mock_table.insert.return_value.execute.return_value = MagicMock(data=[{}]) - - documents = [ - Document(content="Python is great"), - Document(content="Haystack is a RAG framework"), - ] - # use OVERWRITE to avoid the duplicate check entirely - written = groonga_store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE) - assert written == 2 - - -def test_write_documents_empty(groonga_store): - """Test that writing empty list returns 0.""" - written = groonga_store.write_documents([]) - assert written == 0 - - -def test_write_documents_overwrite(groonga_store, mock_supabase_client): - """Test that overwrite policy uses upsert.""" - mock_table = mock_supabase_client.table.return_value - mock_table.upsert.return_value.execute.return_value = MagicMock(data=[{}]) - - documents = [Document(content="test document")] - written = groonga_store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE) - assert written == 1 - mock_table.upsert.assert_called_once() - - -def test_write_documents_skip(groonga_store, mock_supabase_client): - """Test that skip policy skips existing documents.""" - mock_table = mock_supabase_client.table.return_value - # simulate document already exists - mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[{"id": "existing"}]) - - documents = [Document(content="already exists")] - written = groonga_store.write_documents(documents, policy=DuplicatePolicy.SKIP) - assert written == 0 - - -def test_write_documents_fail_on_duplicate(groonga_store, mock_supabase_client): - """Test that FAIL policy raises error on duplicate.""" - mock_table = mock_supabase_client.table.return_value - # simulate document already exists - mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[{"id": "existing"}]) - - 
doc = Document(content="duplicate doc") - with pytest.raises(DuplicateDocumentError): - groonga_store.write_documents([doc], policy=DuplicatePolicy.FAIL) - - -def test_delete_documents(groonga_store, mock_supabase_client): - """Test that delete_documents calls delete with correct IDs.""" - mock_table = mock_supabase_client.table.return_value - mock_table.delete.return_value.in_.return_value.execute.return_value = MagicMock(data=[]) - - groonga_store.delete_documents(["id1", "id2"]) - mock_table.delete.assert_called_once() - - -def test_delete_documents_empty(groonga_store, mock_supabase_client): - """Test that deleting empty list does nothing.""" - groonga_store.delete_documents([]) - mock_supabase_client.table.return_value.delete.assert_not_called() - - -def test_filter_documents(groonga_store, mock_supabase_client): - """Test that filter_documents returns correct documents.""" - mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock( - data=[ - {"id": "1", "content": "Python is great", "meta": {}, "score": None}, - {"id": "2", "content": "Haystack rocks", "meta": {}, "score": None}, +class TestDocumentStore: + def test_init_defaults(self, monkeypatch): + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore(supabase_url="https://fake-project.supabase.co") + assert store.table_name == "haystack_groonga_documents" + assert store.recreate_table is False + assert store.supabase_url == "https://fake-project.supabase.co" + assert store._client is None + + def test_init_custom_params(self, monkeypatch): + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + table_name="my_custom_table", + recreate_table=True, + ) + assert store.table_name == "my_custom_table" + assert store.recreate_table is True + assert store._client is None + + def test_invalid_table_name_raises(self, monkeypatch): + 
monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + with pytest.raises(ValueError, match="Invalid table_name"): + SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + table_name="bad-name; DROP TABLE users;", + ) + + def test_table_name_with_numbers_allowed(self, monkeypatch): + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + table_name="my_table_123", + ) + assert store.table_name == "my_table_123" + + def test_warm_up_initializes_client(self, mock_supabase_client, monkeypatch): # noqa: ARG002 + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore(supabase_url="https://fake-project.supabase.co") + assert store._client is None + store.warm_up() + assert store._client is not None + + def test_count_documents(self, groonga_store, mock_supabase_client): + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock(count=5) + assert groonga_store.count_documents() == 5 + + def test_count_documents_empty(self, groonga_store, mock_supabase_client): + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock(count=0) + assert groonga_store.count_documents() == 0 + + def test_write_documents(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[]) + mock_table.insert.return_value.execute.return_value = MagicMock(data=[{}]) + + documents = [ + Document(content="Python is great"), + Document(content="Haystack is a RAG framework"), ] - ) - docs = groonga_store.filter_documents() - assert len(docs) == 2 - assert docs[0].content == "Python is great" - assert docs[1].content == "Haystack rocks" - - -def test_filter_documents_with_filters(groonga_store, mock_supabase_client): - """Test that 
filter_documents applies filters correctly.""" - mock_table = mock_supabase_client.table.return_value - mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock( - data=[ - {"id": "1", "content": "Python is great", "meta": {"language": "en"}, "score": None}, - ] - ) - filters = {"conditions": [{"field": "meta.language", "operator": "==", "value": "en"}]} - docs = groonga_store.filter_documents(filters=filters) - assert len(docs) == 1 - - -# ───────────────────────────────────────────── -# SERIALIZATION TESTS -# ───────────────────────────────────────────── - - -def test_to_dict(groonga_store): - """Test that to_dict returns correct dictionary.""" - result = groonga_store.to_dict() - assert result["type"] == ( - "haystack_integrations.document_stores.supabase.groonga_document_store.SupabaseGroongaDocumentStore" - ) - assert result["init_parameters"]["table_name"] == "test_groonga_documents" - assert result["init_parameters"]["supabase_url"] == "https://fake-project.supabase.co" - assert result["init_parameters"]["recreate_table"] is False - - -def test_from_dict(mock_supabase_client, monkeypatch): # noqa: ARG001 - """Test that from_dict recreates the store correctly.""" - monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") - data = { - "type": ("haystack_integrations.document_stores.supabase.groonga_document_store.SupabaseGroongaDocumentStore"), - "init_parameters": { - "supabase_url": "https://fake-project.supabase.co", - "supabase_key": { - "type": "env_var", - "env_vars": ["SUPABASE_SERVICE_KEY"], - "strict": True, + written = groonga_store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE) + assert written == 2 + + def test_write_documents_empty(self, groonga_store): + assert groonga_store.write_documents([]) == 0 + + def test_write_documents_overwrite(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.upsert.return_value.execute.return_value = 
MagicMock(data=[{}]) + + written = groonga_store.write_documents([Document(content="test document")], policy=DuplicatePolicy.OVERWRITE) + assert written == 1 + mock_table.upsert.assert_called_once() + + def test_write_documents_skip_existing(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[{"id": "existing"}]) + + written = groonga_store.write_documents([Document(content="already exists")], policy=DuplicatePolicy.SKIP) + assert written == 0 + + def test_write_documents_fail_on_duplicate(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[{"id": "existing"}]) + + with pytest.raises(DuplicateDocumentError): + groonga_store.write_documents([Document(content="duplicate doc")], policy=DuplicatePolicy.FAIL) + + def test_delete_documents(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.delete.return_value.in_.return_value.execute.return_value = MagicMock(data=[]) + + groonga_store.delete_documents(["id1", "id2"]) + mock_table.delete.assert_called_once() + + def test_delete_documents_empty(self, groonga_store, mock_supabase_client): + groonga_store.delete_documents([]) + mock_supabase_client.table.return_value.delete.assert_not_called() + + def test_filter_documents(self, groonga_store, mock_supabase_client): + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock( + data=[ + {"id": "1", "content": "Python is great", "meta": {}, "score": None}, + {"id": "2", "content": "Haystack rocks", "meta": {}, "score": None}, + ] + ) + docs = groonga_store.filter_documents() + assert len(docs) == 2 + assert docs[0].content == "Python is great" + assert docs[1].content == "Haystack rocks" + + def 
test_filter_documents_with_filters(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock( + data=[{"id": "1", "content": "Python is great", "meta": {"language": "en"}, "score": None}] + ) + filters = {"conditions": [{"field": "meta.language", "operator": "==", "value": "en"}]} + docs = groonga_store.filter_documents(filters=filters) + assert len(docs) == 1 + + def test_to_dict(self, groonga_store): + result = groonga_store.to_dict() + assert result["type"] == ( + "haystack_integrations.document_stores.supabase.groonga_document_store.SupabaseGroongaDocumentStore" + ) + assert result["init_parameters"]["table_name"] == "test_groonga_documents" + assert result["init_parameters"]["supabase_url"] == "https://fake-project.supabase.co" + assert result["init_parameters"]["recreate_table"] is False + + def test_from_dict(self, mock_supabase_client, monkeypatch): # noqa: ARG002 + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + data = { + "type": ( + "haystack_integrations.document_stores.supabase.groonga_document_store.SupabaseGroongaDocumentStore" + ), + "init_parameters": { + "supabase_url": "https://fake-project.supabase.co", + "supabase_key": { + "type": "env_var", + "env_vars": ["SUPABASE_SERVICE_KEY"], + "strict": True, + }, + "table_name": "test_groonga_documents", + "recreate_table": False, }, - "table_name": "test_groonga_documents", - "recreate_table": False, - }, - } - store = SupabaseGroongaDocumentStore.from_dict(data) - assert store.table_name == "test_groonga_documents" - assert store.supabase_url == "https://fake-project.supabase.co" - # client should be None — warm_up() not called yet - assert store._client is None + } + store = SupabaseGroongaDocumentStore.from_dict(data) + assert store.table_name == "test_groonga_documents" + assert store.supabase_url == "https://fake-project.supabase.co" + assert store._client is 
None # ───────────────────────────────────────────── @@ -258,95 +214,80 @@ def test_from_dict(mock_supabase_client, monkeypatch): # noqa: ARG001 # ───────────────────────────────────────────── -def test_retriever_init(groonga_store): - """Test that retriever initializes correctly.""" - retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) - assert retriever.top_k == 5 - assert retriever.document_store == groonga_store - - -def test_retriever_init_default_top_k(groonga_store): - """Test that retriever default top_k is 10.""" - retriever = SupabaseGroongaRetriever(document_store=groonga_store) - assert retriever.top_k == 10 - - -def test_retriever_run_empty_query(groonga_store): - """Test that empty query returns empty documents.""" - retriever = SupabaseGroongaRetriever(document_store=groonga_store) - result = retriever.run(query="") - assert result == {"documents": []} - - -def test_retriever_run(groonga_store, mock_supabase_client): - """Test that retriever run calls document store correctly.""" - mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( - data=[ - {"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}, - ] - ) - retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) - result = retriever.run(query="Python") - assert "documents" in result - assert len(result["documents"]) == 1 - assert result["documents"][0].content == "Python is great" - - -@pytest.mark.asyncio -async def test_retriever_run_async(groonga_store, mock_supabase_client): - """Test that async retriever run returns same result as sync run.""" - mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( - data=[ - {"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}, - ] - ) - retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) - result = await retriever.run_async(query="Python") - assert "documents" in result - assert len(result["documents"]) == 1 - - 
-@pytest.mark.asyncio -async def test_retriever_run_async_empty_query(groonga_store): - """Test that empty query in async run returns empty documents.""" - retriever = SupabaseGroongaRetriever(document_store=groonga_store) - result = await retriever.run_async(query="") - assert result == {"documents": []} - - -def test_retriever_to_dict(groonga_store): - """Test that retriever serializes correctly.""" - retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) - result = retriever.to_dict() - assert result["init_parameters"]["top_k"] == 5 - assert "document_store" in result["init_parameters"] - - -def test_retriever_from_dict(mock_supabase_client, monkeypatch): # noqa: ARG001 - """Test that retriever deserializes correctly.""" - monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") - data = { - "type": ("haystack_integrations.components.retrievers.supabase.groonga_retriever.SupabaseGroongaRetriever"), - "init_parameters": { - "top_k": 7, - "filters": {}, - "filter_policy": "replace", - "document_store": { - "type": ( - "haystack_integrations.document_stores.supabase.groonga_document_store.SupabaseGroongaDocumentStore" - ), - "init_parameters": { - "supabase_url": "https://fake-project.supabase.co", - "supabase_key": { - "type": "env_var", - "env_vars": ["SUPABASE_SERVICE_KEY"], - "strict": True, +class TestRetriever: + def test_init_invalid_store(self): + with pytest.raises(ValueError, match="document_store must be an instance"): + SupabaseGroongaRetriever(document_store="not_a_store") + + def test_init(self, groonga_store): + retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + assert retriever.top_k == 5 + assert retriever.document_store is groonga_store + + def test_init_default_top_k(self, groonga_store): + retriever = SupabaseGroongaRetriever(document_store=groonga_store) + assert retriever.top_k == 10 + + def test_run_empty_query(self, groonga_store): + retriever = 
SupabaseGroongaRetriever(document_store=groonga_store) + assert retriever.run(query="") == {"documents": []} + + def test_run(self, groonga_store, mock_supabase_client): + mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( + data=[{"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}] + ) + retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + result = retriever.run(query="Python") + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "Python is great" + + @pytest.mark.asyncio + async def test_run_async(self, groonga_store, mock_supabase_client): + mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( + data=[{"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}] + ) + retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + result = await retriever.run_async(query="Python") + assert len(result["documents"]) == 1 + + @pytest.mark.asyncio + async def test_run_async_empty_query(self, groonga_store): + retriever = SupabaseGroongaRetriever(document_store=groonga_store) + assert await retriever.run_async(query="") == {"documents": []} + + def test_to_dict(self, groonga_store): + retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + result = retriever.to_dict() + assert result["init_parameters"]["top_k"] == 5 + assert "document_store" in result["init_parameters"] + + def test_from_dict(self, mock_supabase_client, monkeypatch): # noqa: ARG002 + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + data = { + "type": ( + "haystack_integrations.components.retrievers.supabase.groonga_retriever.SupabaseGroongaRetriever" + ), + "init_parameters": { + "top_k": 7, + "filters": {}, + "filter_policy": "replace", + "document_store": { + "type": ( + "haystack_integrations.document_stores.supabase" + ".groonga_document_store.SupabaseGroongaDocumentStore" + ), + "init_parameters": { + "supabase_url": 
"https://fake-project.supabase.co", + "supabase_key": { + "type": "env_var", + "env_vars": ["SUPABASE_SERVICE_KEY"], + "strict": True, + }, + "table_name": "test_groonga_documents", + "recreate_table": False, }, - "table_name": "test_groonga_documents", - "recreate_table": False, }, }, - }, - } - retriever = SupabaseGroongaRetriever.from_dict(data) - assert retriever.top_k == 7 + } + retriever = SupabaseGroongaRetriever.from_dict(data) + assert retriever.top_k == 7 From 9e68df76f26f007ce017710c9e8b9a7fdf3b3295 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 29 May 2026 15:20:04 +0200 Subject: [PATCH 18/34] decoupling tests --- .../tests/test_groonga_document_store.py | 90 ------------- .../supabase/tests/test_groonga_retriever.py | 127 ++++++++++++++++++ 2 files changed, 127 insertions(+), 90 deletions(-) create mode 100644 integrations/supabase/tests/test_groonga_retriever.py diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py index 5d9e6fc33e..599a6e91ab 100644 --- a/integrations/supabase/tests/test_groonga_document_store.py +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -9,14 +9,8 @@ from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy -from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore -# ───────────────────────────────────────────── -# FIXTURES -# ───────────────────────────────────────────── - - @pytest.fixture def mock_supabase_client(): """Creates a mock Supabase client so we never hit a real database.""" @@ -207,87 +201,3 @@ def test_from_dict(self, mock_supabase_client, monkeypatch): # noqa: ARG002 assert store.table_name == "test_groonga_documents" assert store.supabase_url == "https://fake-project.supabase.co" assert store._client is None - - 
-# ───────────────────────────────────────────── -# RETRIEVER TESTS -# ───────────────────────────────────────────── - - -class TestRetriever: - def test_init_invalid_store(self): - with pytest.raises(ValueError, match="document_store must be an instance"): - SupabaseGroongaRetriever(document_store="not_a_store") - - def test_init(self, groonga_store): - retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) - assert retriever.top_k == 5 - assert retriever.document_store is groonga_store - - def test_init_default_top_k(self, groonga_store): - retriever = SupabaseGroongaRetriever(document_store=groonga_store) - assert retriever.top_k == 10 - - def test_run_empty_query(self, groonga_store): - retriever = SupabaseGroongaRetriever(document_store=groonga_store) - assert retriever.run(query="") == {"documents": []} - - def test_run(self, groonga_store, mock_supabase_client): - mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( - data=[{"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}] - ) - retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) - result = retriever.run(query="Python") - assert len(result["documents"]) == 1 - assert result["documents"][0].content == "Python is great" - - @pytest.mark.asyncio - async def test_run_async(self, groonga_store, mock_supabase_client): - mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( - data=[{"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}] - ) - retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) - result = await retriever.run_async(query="Python") - assert len(result["documents"]) == 1 - - @pytest.mark.asyncio - async def test_run_async_empty_query(self, groonga_store): - retriever = SupabaseGroongaRetriever(document_store=groonga_store) - assert await retriever.run_async(query="") == {"documents": []} - - def test_to_dict(self, groonga_store): - retriever = 
SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) - result = retriever.to_dict() - assert result["init_parameters"]["top_k"] == 5 - assert "document_store" in result["init_parameters"] - - def test_from_dict(self, mock_supabase_client, monkeypatch): # noqa: ARG002 - monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") - data = { - "type": ( - "haystack_integrations.components.retrievers.supabase.groonga_retriever.SupabaseGroongaRetriever" - ), - "init_parameters": { - "top_k": 7, - "filters": {}, - "filter_policy": "replace", - "document_store": { - "type": ( - "haystack_integrations.document_stores.supabase" - ".groonga_document_store.SupabaseGroongaDocumentStore" - ), - "init_parameters": { - "supabase_url": "https://fake-project.supabase.co", - "supabase_key": { - "type": "env_var", - "env_vars": ["SUPABASE_SERVICE_KEY"], - "strict": True, - }, - "table_name": "test_groonga_documents", - "recreate_table": False, - }, - }, - }, - } - retriever = SupabaseGroongaRetriever.from_dict(data) - assert retriever.top_k == 7 diff --git a/integrations/supabase/tests/test_groonga_retriever.py b/integrations/supabase/tests/test_groonga_retriever.py new file mode 100644 index 0000000000..edfbd95a47 --- /dev/null +++ b/integrations/supabase/tests/test_groonga_retriever.py @@ -0,0 +1,127 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import MagicMock, patch + +import pytest +from haystack.dataclasses import Document +from haystack.document_stores.errors import DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy + +from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + + +@pytest.fixture +def mock_supabase_client(): + """Creates a mock Supabase client so we never hit a real database.""" + with 
patch("haystack_integrations.document_stores.supabase.groonga_document_store.create_client") as mock_create: + mock_client = MagicMock() + mock_create.return_value = mock_client + + mock_client.rpc.return_value.execute.return_value = MagicMock(data=[], count=0) + + mock_table = MagicMock() + mock_client.table.return_value = mock_table + mock_table.select.return_value = mock_table + mock_table.insert.return_value = mock_table + mock_table.upsert.return_value = mock_table + mock_table.delete.return_value = mock_table + mock_table.eq.return_value = mock_table + mock_table.in_.return_value = mock_table + mock_table.execute.return_value = MagicMock(data=[], count=0) + + yield mock_client + + +@pytest.fixture +def groonga_store(mock_supabase_client, monkeypatch): # noqa: ARG001 + """Creates a SupabaseGroongaDocumentStore with mocked client and calls warm_up().""" + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + table_name="test_groonga_documents", + recreate_table=False, + ) + store.warm_up() + return store + + +class TestRetriever: + def test_init_invalid_store(self): + with pytest.raises(ValueError, match="document_store must be an instance"): + SupabaseGroongaRetriever(document_store="not_a_store") + + def test_init(self, groonga_store): + retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + assert retriever.top_k == 5 + assert retriever.document_store is groonga_store + + def test_init_default_top_k(self, groonga_store): + retriever = SupabaseGroongaRetriever(document_store=groonga_store) + assert retriever.top_k == 10 + + def test_run_empty_query(self, groonga_store): + retriever = SupabaseGroongaRetriever(document_store=groonga_store) + assert retriever.run(query="") == {"documents": []} + + def test_run(self, groonga_store, mock_supabase_client): + mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( + data=[{"id": 
"1", "content": "Python is great", "meta": {}, "score": 1.0}] + ) + retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + result = retriever.run(query="Python") + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "Python is great" + + @pytest.mark.asyncio + async def test_run_async(self, groonga_store, mock_supabase_client): + mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( + data=[{"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}] + ) + retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + result = await retriever.run_async(query="Python") + assert len(result["documents"]) == 1 + + @pytest.mark.asyncio + async def test_run_async_empty_query(self, groonga_store): + retriever = SupabaseGroongaRetriever(document_store=groonga_store) + assert await retriever.run_async(query="") == {"documents": []} + + def test_to_dict(self, groonga_store): + retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + result = retriever.to_dict() + assert result["init_parameters"]["top_k"] == 5 + assert "document_store" in result["init_parameters"] + + def test_from_dict(self, mock_supabase_client, monkeypatch): # noqa: ARG002 + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + data = { + "type": ( + "haystack_integrations.components.retrievers.supabase.groonga_retriever.SupabaseGroongaRetriever" + ), + "init_parameters": { + "top_k": 7, + "filters": {}, + "filter_policy": "replace", + "document_store": { + "type": ( + "haystack_integrations.document_stores.supabase" + ".groonga_document_store.SupabaseGroongaDocumentStore" + ), + "init_parameters": { + "supabase_url": "https://fake-project.supabase.co", + "supabase_key": { + "type": "env_var", + "env_vars": ["SUPABASE_SERVICE_KEY"], + "strict": True, + }, + "table_name": "test_groonga_documents", + "recreate_table": False, + }, + }, + }, + } + retriever = 
SupabaseGroongaRetriever.from_dict(data) + assert retriever.top_k == 7 From 7bc7a91d4779dd66bf1b421478c7d1f9db488708 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 29 May 2026 15:31:23 +0200 Subject: [PATCH 19/34] removing unused imports# --- integrations/supabase/tests/test_groonga_document_store.py | 1 + integrations/supabase/tests/test_groonga_retriever.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py index 599a6e91ab..d7daacc89d 100644 --- a/integrations/supabase/tests/test_groonga_document_store.py +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -11,6 +11,7 @@ from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + @pytest.fixture def mock_supabase_client(): """Creates a mock Supabase client so we never hit a real database.""" diff --git a/integrations/supabase/tests/test_groonga_retriever.py b/integrations/supabase/tests/test_groonga_retriever.py index edfbd95a47..666998ccf2 100644 --- a/integrations/supabase/tests/test_groonga_retriever.py +++ b/integrations/supabase/tests/test_groonga_retriever.py @@ -5,9 +5,6 @@ from unittest.mock import MagicMock, patch import pytest -from haystack.dataclasses import Document -from haystack.document_stores.errors import DuplicateDocumentError -from haystack.document_stores.types import DuplicatePolicy from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore From 329a2ec5558ea9b7772dde1d54aa5c9b92dc9769 Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Sun, 31 May 2026 23:55:02 +0530 Subject: [PATCH 20/34] fix(supabase): add strict=False to supabase_key default --- integrations/supabase/pytest | 0 .../document_stores/supabase/groonga_document_store.py | 7 ++----- 
integrations/supabase/tests/test_groonga_integration.py | 3 ++- 3 files changed, 4 insertions(+), 6 deletions(-) create mode 100644 integrations/supabase/pytest diff --git a/integrations/supabase/pytest b/integrations/supabase/pytest new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index a8970bd5a5..3e0cd8c172 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -48,7 +48,7 @@ def __init__( self, *, supabase_url: str, - supabase_key: Secret = Secret.from_env_var("SUPABASE_SERVICE_KEY"), + supabase_key: Secret = Secret.from_env_var("SUPABASE_SERVICE_KEY", strict=False), table_name: str = "haystack_groonga_documents", recreate_table: bool = False, ) -> None: @@ -129,10 +129,7 @@ def count_documents(self) -> int: if self._client is None: msg = "Call warm_up() before using the document store." 
raise RuntimeError(msg) - result = self._client.rpc( - "groonga_search", - {"query_text": query, "table_name": self.table_name, "top_k": top_k}, - ).execute() + result = self._client.table(self.table_name).select("*", count=CountMethod.exact).execute() return int(result.count) if result.count is not None else 0 def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]: diff --git a/integrations/supabase/tests/test_groonga_integration.py b/integrations/supabase/tests/test_groonga_integration.py index 9b760b5c4c..cef4db6465 100644 --- a/integrations/supabase/tests/test_groonga_integration.py +++ b/integrations/supabase/tests/test_groonga_integration.py @@ -167,4 +167,5 @@ def test_retriever_empty_query(self, document_store): """Test retriever returns empty list for empty query.""" retriever = SupabaseGroongaRetriever(document_store=document_store) result = retriever.run(query="") - assert result == {"documents": []} \ No newline at end of file + assert result == {"documents": []} + \ No newline at end of file From f789808424d5ee84e1b6f75d0dcc2838a7f7c4f9 Mon Sep 17 00:00:00 2001 From: ShubhamGond105 Date: Mon, 1 Jun 2026 00:03:47 +0530 Subject: [PATCH 21/34] fix(supabase): skip integration tests when SUPABASE_URL not set --- integrations/supabase/tests/test_groonga_integration.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/integrations/supabase/tests/test_groonga_integration.py b/integrations/supabase/tests/test_groonga_integration.py index cef4db6465..e82fe89401 100644 --- a/integrations/supabase/tests/test_groonga_integration.py +++ b/integrations/supabase/tests/test_groonga_integration.py @@ -20,8 +20,12 @@ def document_store(): Creates a real SupabaseGroongaDocumentStore connected to a test Supabase project. Requires SUPABASE_URL and SUPABASE_SERVICE_KEY environment variables to be set. 
""" + supabase_url = os.environ.get("SUPABASE_URL") + if not supabase_url: + pytest.skip("SUPABASE_URL not set") + store = SupabaseGroongaDocumentStore( - supabase_url=os.environ["SUPABASE_URL"], + supabase_url=supabase_url, supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), table_name="haystack_groonga_integration_test", recreate_table=False, @@ -168,4 +172,3 @@ def test_retriever_empty_query(self, document_store): retriever = SupabaseGroongaRetriever(document_store=document_store) result = retriever.run(query="") assert result == {"documents": []} - \ No newline at end of file From 91d6ee0327bd7f8e96cfc9fbc100fadec3aa00ed Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 1 Jun 2026 11:36:03 +0200 Subject: [PATCH 22/34] fixing pyproject.toml --- integrations/supabase/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/supabase/pyproject.toml b/integrations/supabase/pyproject.toml index 0c8855f890..fdaf412c09 100644 --- a/integrations/supabase/pyproject.toml +++ b/integrations/supabase/pyproject.toml @@ -154,6 +154,7 @@ show_missing = true exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] [tool.pytest.ini_options] +asyncio_mode = "auto" addopts = "--strict-markers" markers = [ "integration: integration tests", From 5e98a1fd6a0ca7c5a9be849040c3f7b674d3fb5e Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Mon, 1 Jun 2026 11:48:23 +0200 Subject: [PATCH 23/34] adding integrations tests --- .../supabase/groonga_document_store.py | 9 ++ .../tests/test_groonga_document_store.py | 7 + .../tests/test_groonga_integration.py | 121 +++--------------- 3 files changed, 33 insertions(+), 104 deletions(-) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index 3e0cd8c172..937f8bb0e3 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -234,6 +234,15 @@ def write_documents( return written + def delete_all_documents(self) -> None: + """ + Deletes all documents from the store. + """ + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) + self._client.table(self.table_name).delete().neq("id", "").execute() + def delete_documents(self, document_ids: list[str]) -> None: """ Deletes documents with the given IDs. 
diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py index d7daacc89d..4e3255d608 100644 --- a/integrations/supabase/tests/test_groonga_document_store.py +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -140,6 +140,13 @@ def test_write_documents_fail_on_duplicate(self, groonga_store, mock_supabase_cl with pytest.raises(DuplicateDocumentError): groonga_store.write_documents([Document(content="duplicate doc")], policy=DuplicatePolicy.FAIL) + def test_delete_all_documents(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.delete.return_value.neq.return_value.execute.return_value = MagicMock(data=[]) + + groonga_store.delete_all_documents() + mock_table.delete.assert_called_once() + def test_delete_documents(self, groonga_store, mock_supabase_client): mock_table = mock_supabase_client.table.return_value mock_table.delete.return_value.in_.return_value.execute.return_value = MagicMock(data=[]) diff --git a/integrations/supabase/tests/test_groonga_integration.py b/integrations/supabase/tests/test_groonga_integration.py index e82fe89401..2cb33cb0be 100644 --- a/integrations/supabase/tests/test_groonga_integration.py +++ b/integrations/supabase/tests/test_groonga_integration.py @@ -3,11 +3,15 @@ # SPDX-License-Identifier: Apache-2.0 import os +import re import pytest from haystack.dataclasses import Document -from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy +from haystack.testing.document_store import ( + DeleteAllTest, + DocumentStoreBaseTests, +) from haystack.utils import Secret from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever @@ -15,7 +19,7 @@ @pytest.fixture() -def document_store(): +def document_store(request): """ Creates a real SupabaseGroongaDocumentStore connected to a test Supabase project. 
Requires SUPABASE_URL and SUPABASE_SERVICE_KEY environment variables to be set. @@ -24,115 +28,28 @@ def document_store(): if not supabase_url: pytest.skip("SUPABASE_URL not set") + safe_name = re.sub(r"[^a-zA-Z0-9]", "_", request.node.name)[:40] + table_name = f"hg_{safe_name}" + store = SupabaseGroongaDocumentStore( supabase_url=supabase_url, supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), - table_name="haystack_groonga_integration_test", - recreate_table=False, + table_name=table_name, + recreate_table=True, ) store.warm_up() yield store - # Cleanup after each test - all_docs = store.filter_documents() - if all_docs: - store.delete_documents([doc.id for doc in all_docs]) + store.delete_all_documents() @pytest.mark.integration -class TestSupabaseGroongaDocumentStoreIntegration: - def test_count_empty(self, document_store): - """Test count is zero for an empty document store.""" - assert document_store.count_documents() == 0 +class TestSupabaseGroongaDocumentStoreIntegration(DocumentStoreBaseTests, DeleteAllTest): + pass - def test_count_not_empty(self, document_store): - """Test count is correct after writing documents.""" - docs = [ - Document(content="test doc 1"), - Document(content="test doc 2"), - Document(content="test doc 3"), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - def test_write_and_filter_documents(self, document_store): - """Test writing and retrieving documents.""" - docs = [ - Document(content="Python is a programming language"), - Document(content="Haystack is a RAG framework"), - ] - written = document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - assert written == 2 - - all_docs = document_store.filter_documents() - assert len(all_docs) == 2 - contents = {doc.content for doc in all_docs} - assert "Python is a programming language" in contents - assert "Haystack is a RAG framework" in contents - - def test_write_documents_duplicate_fail(self, document_store): - 
"""Test write_documents fails with DuplicatePolicy.FAIL on duplicate.""" - doc = Document(content="test doc") - document_store.write_documents([doc], policy=DuplicatePolicy.FAIL) - - with pytest.raises(DuplicateDocumentError): - document_store.write_documents([doc], policy=DuplicatePolicy.FAIL) - - def test_write_documents_duplicate_skip(self, document_store): - """Test write_documents skips duplicate with DuplicatePolicy.SKIP.""" - doc = Document(content="test doc") - assert document_store.write_documents([doc], policy=DuplicatePolicy.SKIP) == 1 - assert document_store.write_documents([doc], policy=DuplicatePolicy.SKIP) == 0 - assert document_store.count_documents() == 1 - - def test_write_documents_duplicate_overwrite(self, document_store): - """Test write_documents overwrites with DuplicatePolicy.OVERWRITE.""" - doc1 = Document(id="test-id-1", content="original content") - doc2 = Document(id="test-id-1", content="updated content") - - document_store.write_documents([doc1], policy=DuplicatePolicy.OVERWRITE) - document_store.write_documents([doc2], policy=DuplicatePolicy.OVERWRITE) - - all_docs = document_store.filter_documents() - assert len(all_docs) == 1 - assert all_docs[0].content == "updated content" - - def test_delete_documents(self, document_store): - """Test deleting documents by ID.""" - docs = [ - Document(content="doc to delete"), - Document(content="doc to keep"), - ] - document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - assert document_store.count_documents() == 2 - - document_store.delete_documents([docs[0].id]) - assert document_store.count_documents() == 1 - - remaining = document_store.filter_documents() - assert remaining[0].content == "doc to keep" - - def test_delete_documents_empty_list(self, document_store): - """Test deleting with empty list does nothing.""" - doc = Document(content="test doc") - document_store.write_documents([doc], policy=DuplicatePolicy.OVERWRITE) - document_store.delete_documents([]) - assert 
document_store.count_documents() == 1 - - def test_write_documents_with_meta(self, document_store): - """Test writing documents with metadata.""" - docs = [ - Document(content="Python tutorial", meta={"language": "en", "topic": "programming"}), - Document(content="French cooking", meta={"language": "fr", "topic": "cooking"}), - ] - document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - all_docs = document_store.filter_documents() - assert len(all_docs) == 2 - meta_map = {doc.content: doc.meta for doc in all_docs} - assert meta_map["Python tutorial"]["language"] == "en" - assert meta_map["French cooking"]["language"] == "fr" +@pytest.mark.integration +class TestGroongaRetriever: def test_groonga_retrieval(self, document_store): - """Test full-text search retrieval using PGroonga.""" docs = [ Document(content="Python is a great programming language"), Document(content="Haystack is built for RAG pipelines"), @@ -145,7 +62,6 @@ def test_groonga_retrieval(self, document_store): assert any("Python" in doc.content for doc in results) def test_groonga_retrieval_top_k(self, document_store): - """Test that top_k limits the number of results.""" docs = [Document(content=f"document about python number {i}") for i in range(5)] document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) @@ -153,7 +69,6 @@ def test_groonga_retrieval_top_k(self, document_store): assert len(results) <= 2 def test_retriever_run(self, document_store): - """Test the retriever returns documents for a query.""" docs = [ Document(content="Python programming is fun"), Document(content="Java is also popular"), @@ -168,7 +83,5 @@ def test_retriever_run(self, document_store): assert any("Python" in doc.content for doc in result["documents"]) def test_retriever_empty_query(self, document_store): - """Test retriever returns empty list for empty query.""" retriever = SupabaseGroongaRetriever(document_store=document_store) - result = retriever.run(query="") - assert result == 
{"documents": []} + assert retriever.run(query="") == {"documents": []} From e3c0bfcdcbfd25d4ad41ba0ec2db98d71efcca11 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 1 Jun 2026 11:50:04 +0200 Subject: [PATCH 24/34] adding delete_by_filter and update_by_filter --- .../supabase/groonga_document_store.py | 42 +++++++++++++++++ .../tests/test_groonga_document_store.py | 45 +++++++++++++++++++ .../tests/test_groonga_integration.py | 11 ++++- 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index 937f8bb0e3..d388140e51 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -234,6 +234,48 @@ def write_documents( return written + def delete_by_filter(self, filters: dict[str, Any]) -> int: + """ + Deletes documents matching the given filters. + + :param filters: Filters to select documents for deletion. + :returns: Number of documents deleted. + """ + docs = self.filter_documents(filters=filters) + if not docs: + return 0 + self.delete_documents([doc.id for doc in docs]) + return len(docs) + + def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any]) -> int: + """ + Updates the metadata of documents matching the given filters. + + Provided meta fields are merged into the existing document metadata. + + :param filters: Filters to select documents to update. + :param meta: Metadata fields to set on matching documents. + :returns: Number of documents updated. + """ + if self._client is None: + msg = "Call warm_up() before using the document store." 
+ raise RuntimeError(msg) + + docs = self.filter_documents(filters=filters) + if not docs: + return 0 + + for doc in docs: + row = { + "id": doc.id, + "content": doc.content or "", + "meta": {**doc.meta, **meta}, + "score": None, + } + self._client.table(self.table_name).upsert(row).execute() + + return len(docs) + def delete_all_documents(self) -> None: """ Deletes all documents from the store. diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py index 4e3255d608..8d98ec1cea 100644 --- a/integrations/supabase/tests/test_groonga_document_store.py +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -140,6 +140,51 @@ def test_write_documents_fail_on_duplicate(self, groonga_store, mock_supabase_cl with pytest.raises(DuplicateDocumentError): groonga_store.write_documents([Document(content="duplicate doc")], policy=DuplicatePolicy.FAIL) + def test_delete_by_filter(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.select.return_value.execute.return_value = MagicMock( + data=[{"id": "1", "content": "doc one", "meta": {"lang": "en"}, "score": None}] + ) + mock_table.delete.return_value.in_.return_value.execute.return_value = MagicMock(data=[]) + + deleted = groonga_store.delete_by_filter( + filters={"conditions": [{"field": "meta.lang", "operator": "==", "value": "en"}]} + ) + assert deleted == 1 + + def test_delete_by_filter_no_matches(self, groonga_store, mock_supabase_client): + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock(data=[]) + + deleted = groonga_store.delete_by_filter( + filters={"conditions": [{"field": "meta.lang", "operator": "==", "value": "fr"}]} + ) + assert deleted == 0 + + def test_update_by_filter(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.select.return_value.execute.return_value = 
MagicMock( + data=[{"id": "1", "content": "doc one", "meta": {"lang": "en"}, "score": None}] + ) + mock_table.upsert.return_value.execute.return_value = MagicMock(data=[{}]) + + updated = groonga_store.update_by_filter( + filters={"conditions": [{"field": "meta.lang", "operator": "==", "value": "en"}]}, + meta={"reviewed": True}, + ) + assert updated == 1 + mock_table.upsert.assert_called_once() + upserted_row = mock_table.upsert.call_args[0][0] + assert upserted_row["meta"] == {"lang": "en", "reviewed": True} + + def test_update_by_filter_no_matches(self, groonga_store, mock_supabase_client): + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock(data=[]) + + updated = groonga_store.update_by_filter( + filters={"conditions": [{"field": "meta.lang", "operator": "==", "value": "fr"}]}, + meta={"reviewed": True}, + ) + assert updated == 0 + def test_delete_all_documents(self, groonga_store, mock_supabase_client): mock_table = mock_supabase_client.table.return_value mock_table.delete.return_value.neq.return_value.execute.return_value = MagicMock(data=[]) diff --git a/integrations/supabase/tests/test_groonga_integration.py b/integrations/supabase/tests/test_groonga_integration.py index 2cb33cb0be..85e5232fb8 100644 --- a/integrations/supabase/tests/test_groonga_integration.py +++ b/integrations/supabase/tests/test_groonga_integration.py @@ -10,7 +10,10 @@ from haystack.document_stores.types import DuplicatePolicy from haystack.testing.document_store import ( DeleteAllTest, + DeleteByFilterTest, DocumentStoreBaseTests, + FilterableDocsFixtureMixin, + UpdateByFilterTest, ) from haystack.utils import Secret @@ -43,7 +46,13 @@ def document_store(request): @pytest.mark.integration -class TestSupabaseGroongaDocumentStoreIntegration(DocumentStoreBaseTests, DeleteAllTest): +class TestSupabaseGroongaDocumentStoreIntegration( + DocumentStoreBaseTests, + DeleteAllTest, + DeleteByFilterTest, + FilterableDocsFixtureMixin, + 
UpdateByFilterTest, +): pass From 3a4dc5503bf0555fea25f8de4e6cecfd388056d4 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 1 Jun 2026 13:15:31 +0200 Subject: [PATCH 25/34] adding Docker instance for integration tests and using mixins --- .github/workflows/supabase.yml | 24 +++- integrations/supabase/Dockerfile.pgroonga | 10 ++ .../supabase/docker-compose-groonga.yml | 42 ++++++ integrations/supabase/init-pgroonga.sql | 62 +++++++++ integrations/supabase/nginx-groonga.conf | 18 +++ .../supabase/groonga_document_store.py | 125 +++++++++++++++--- .../tests/test_groonga_document_store.py | 6 - .../tests/test_groonga_integration.py | 63 ++++++--- 8 files changed, 303 insertions(+), 47 deletions(-) create mode 100644 integrations/supabase/Dockerfile.pgroonga create mode 100644 integrations/supabase/docker-compose-groonga.yml create mode 100644 integrations/supabase/init-pgroonga.sql create mode 100644 integrations/supabase/nginx-groonga.conf diff --git a/.github/workflows/supabase.yml b/.github/workflows/supabase.yml index c2abdf5340..33d2a0b8ed 100644 --- a/.github/workflows/supabase.yml +++ b/.github/workflows/supabase.yml @@ -120,11 +120,31 @@ jobs: name: coverage-comment-supabase path: python-coverage-comment-action-supabase.txt - - name: Run integration tests + - name: Run pgvector integration tests if: runner.os == 'Linux' env: SUPABASE_DB_URL: "postgresql://postgres:postgres@localhost:5432/postgres" - run: hatch run test:integration-cov-append-retry + run: hatch run test:integration-cov-append-retry --ignore=tests/test_groonga_integration.py + + - name: Start PGroonga + PostgREST stack + if: runner.os == 'Linux' + run: docker compose -f docker-compose-groonga.yml up -d --build + + - name: Wait for PGroonga stack to be ready + if: runner.os == 'Linux' + run: | + for i in $(seq 1 30); do + if curl -sf http://localhost:8000/rest/v1/ -H "apikey: 
eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hj04zWl196z2-SBc0"; then + echo "PGroonga stack is ready" + break + fi + echo "Waiting for PGroonga stack... ($i/30)" + sleep 5 + done + + - name: Run PGroonga integration tests + if: runner.os == 'Linux' + run: hatch run test:integration-cov-append-retry tests/test_groonga_integration.py - name: Store combined coverage if: github.event_name == 'push' diff --git a/integrations/supabase/Dockerfile.pgroonga b/integrations/supabase/Dockerfile.pgroonga new file mode 100644 index 0000000000..a6623b41c6 --- /dev/null +++ b/integrations/supabase/Dockerfile.pgroonga @@ -0,0 +1,10 @@ +FROM postgres:17-bookworm + +RUN apt-get update && \ + apt-get install -y wget gnupg2 && \ + wget -q -O /tmp/groonga-apt-source.deb \ + https://packages.groonga.org/debian/groonga-apt-source-latest-bookworm.deb && \ + dpkg -i /tmp/groonga-apt-source.deb && \ + apt-get update && \ + apt-get install -y postgresql-17-pgdg-pgroonga && \ + rm -rf /var/lib/apt/lists/* /tmp/groonga-apt-source.deb diff --git a/integrations/supabase/docker-compose-groonga.yml b/integrations/supabase/docker-compose-groonga.yml new file mode 100644 index 0000000000..4886fb972c --- /dev/null +++ b/integrations/supabase/docker-compose-groonga.yml @@ -0,0 +1,42 @@ +services: + pgroonga-postgres: + build: + context: . 
+ dockerfile: Dockerfile.pgroonga + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + ports: + - "5433:5432" + volumes: + - ./init-pgroonga.sql:/docker-entrypoint-initdb.d/init-pgroonga.sql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 10s + timeout: 5s + retries: 10 + + postgrest: + image: postgrest/postgrest:v12.2.0 + environment: + PGRST_DB_URI: postgres://postgres:postgres@pgroonga-postgres:5432/postgres + PGRST_DB_SCHEMAS: public + # No PGRST_JWT_SECRET → JWT validation disabled; all requests run as PGRST_DB_ANON_ROLE. + # supabase-py still sends an apikey header but PostgREST ignores it. + PGRST_DB_ANON_ROLE: postgres + PGRST_LOG_LEVEL: info + ports: + - "3000:3000" + depends_on: + pgroonga-postgres: + condition: service_healthy + + nginx: + image: nginx:alpine + ports: + - "8000:8000" + volumes: + - ./nginx-groonga.conf:/etc/nginx/nginx.conf:ro + depends_on: + - postgrest diff --git a/integrations/supabase/init-pgroonga.sql b/integrations/supabase/init-pgroonga.sql new file mode 100644 index 0000000000..745428b823 --- /dev/null +++ b/integrations/supabase/init-pgroonga.sql @@ -0,0 +1,62 @@ +-- Enable PGroonga extension +CREATE EXTENSION IF NOT EXISTS pgroonga; + +-- PostgreSQL role that PostgREST switches to when a service_role JWT is presented. +-- The role must exist before PostgREST connects. +DO $$ +BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'service_role') THEN + CREATE ROLE service_role NOLOGIN; + END IF; +END +$$; + +GRANT ALL ON SCHEMA public TO service_role; +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO service_role; +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO service_role; +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON FUNCTIONS TO service_role; + +-- exec_sql: allows the document store to create/drop tables and indexes via RPC. 
+CREATE OR REPLACE FUNCTION exec_sql(query TEXT) +RETURNS VOID AS $$ +BEGIN + EXECUTE query; +END; +$$ LANGUAGE plpgsql SECURITY DEFINER; + +GRANT EXECUTE ON FUNCTION exec_sql(TEXT) TO service_role; + +-- groonga_search: full-text search via PGroonga, called by _groonga_retrieval(). +CREATE OR REPLACE FUNCTION groonga_search(query_text TEXT, table_name TEXT, top_k INT) +RETURNS TABLE(id TEXT, content TEXT, meta JSONB, score REAL) AS $$ +DECLARE + sql TEXT; +BEGIN + sql := format( + 'SELECT id, content, meta, pgroonga_score(tableoid, ctid)::REAL AS score + FROM %I + WHERE content &@~ %L + ORDER BY score DESC + LIMIT %s', + table_name, query_text, top_k + ); + RETURN QUERY EXECUTE sql; +END; +$$ LANGUAGE plpgsql; + +GRANT EXECUTE ON FUNCTION groonga_search(TEXT, TEXT, INT) TO service_role; + +-- Pre-create the test table so PostgREST includes it in its schema cache at startup. +-- Tests use this fixed table and clear data between runs instead of recreating the table. +CREATE TABLE IF NOT EXISTS haystack_groonga_test ( + id TEXT PRIMARY KEY, + content TEXT, + meta JSONB, + score REAL +); + +CREATE INDEX IF NOT EXISTS pgroonga_haystack_groonga_test_index +ON haystack_groonga_test +USING pgroonga (content); + +GRANT ALL ON TABLE haystack_groonga_test TO postgres; diff --git a/integrations/supabase/nginx-groonga.conf b/integrations/supabase/nginx-groonga.conf new file mode 100644 index 0000000000..61ad7f3ee8 --- /dev/null +++ b/integrations/supabase/nginx-groonga.conf @@ -0,0 +1,18 @@ +# Minimal reverse proxy so supabase-py (which appends /rest/v1/) reaches PostgREST. +events {} + +http { + server { + listen 8000; + + location /rest/v1/ { + rewrite ^/rest/v1/(.*)$ /$1 break; + proxy_pass http://postgrest:3000; + proxy_set_header Host $host; + # Strip auth headers — PostgREST has no JWT secret configured, + # so all requests run as the anon role (postgres). 
+ proxy_set_header Authorization ""; + proxy_set_header apikey ""; + } + } +} diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index d388140e51..0fa5b153c4 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -3,11 +3,13 @@ # SPDX-License-Identifier: Apache-2.0 import re +from datetime import datetime as _datetime from typing import Any from haystack import default_from_dict, default_to_dict, logging from haystack.dataclasses import Document from haystack.document_stores.errors import DuplicateDocumentError +from haystack.errors import FilterError from haystack.document_stores.types import DocumentStore, DuplicatePolicy from haystack.utils.auth import Secret, deserialize_secrets_inplace from postgrest import CountMethod @@ -157,32 +159,107 @@ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Docume @staticmethod def _apply_filters(query: Any, filters: dict[str, Any]) -> Any: """ - Applies filters to a Supabase query. + Applies Haystack filters to a PostgREST query builder. + + Supports AND logical operator and all standard comparison operators. + OR is supported for simple (non-nested) equality/comparison conditions. :param query: The Supabase query builder. - :param filters: Dictionary of filters to apply. + :param filters: Haystack filter dict. :returns: The query with filters applied. + :raises FilterError: For unsupported operators or invalid value types. 
""" + if not filters: + return query + + # Simple comparison: {"field": "...", "operator": "...", "value": "..."} + if "field" in filters: + return SupabaseGroongaDocumentStore._apply_condition(query, filters) + + op = filters.get("operator", "AND") conditions = filters.get("conditions", []) - for condition in conditions: - field = condition.get("field", "") - op = condition.get("operator", "==") - value = condition.get("value") - - # Handle nested meta fields e.g. "meta.language" - if field.startswith("meta."): - meta_key = field[len("meta.") :] - if op == "==": - query = query.eq(f"meta->>'{meta_key}'", value) - elif op == "!=": - query = query.neq(f"meta->>'{meta_key}'", value) - elif op == "==": - query = query.eq(field, value) - elif op == "!=": - query = query.neq(field, value) - elif op == "in": - query = query.in_(field, value) + if op == "AND": + for cond in conditions: + query = SupabaseGroongaDocumentStore._apply_filters(query, cond) + return query + + if op == "OR": + parts = [] + for cond in conditions: + if "field" not in cond: + msg = "Nested logical operators inside OR are not supported." + raise FilterError(msg) + parts.append(SupabaseGroongaDocumentStore._condition_to_or_part(cond)) + return query.or_(",".join(parts)) + + msg = f"Filter operator '{op}' is not supported. Supported logical operators: AND, OR." + raise FilterError(msg) + + @staticmethod + def _condition_to_or_part(condition: dict[str, Any]) -> str: + field: str = condition.get("field", "") + op: str = condition.get("operator", "==") + value = condition.get("value") + col = f"meta->>{field[len('meta.'):]}" if field.startswith("meta.") else field + pg_op = {"==": "eq", "!=": "neq", ">": "gt", ">=": "gte", "<": "lt", "<=": "lte"} + if op not in pg_op: + msg = f"Operator '{op}' inside OR filter is not supported." 
+ raise FilterError(msg) + return f"{col}.{pg_op[op]}.{value}" + + @staticmethod + def _apply_condition(query: Any, condition: dict[str, Any]) -> Any: + field: str = condition.get("field", "") + op: str = condition.get("operator", "==") + value = condition.get("value") + + # PostgREST JSONB text accessor: meta->>key (no quotes around key name) + col = f"meta->>{field[len('meta.'):]}" if field.startswith("meta.") else field + + if op == "==": + return query.is_(col, "null") if value is None else query.eq(col, value) + + if op == "!=": + return query.not_.is_(col, "null") if value is None else query.neq(col, value) + + if op in (">", ">=", "<", "<="): + if isinstance(value, list): + msg = f"Filter operator '{op}' does not support list values." + raise FilterError(msg) + if value is None: + # No document satisfies an ordering comparison against NULL. + return query.eq("id", "") + if isinstance(value, str): + try: + _datetime.fromisoformat(value) + except ValueError: + msg = f"Filter operator '{op}' does not support plain string values. Use a numeric or ISO date value." + raise FilterError(msg) + # ISO date strings sort correctly as text; no cast needed. + col_cmp = col + else: + # Numeric value: cast JSONB text to numeric for correct ordering. + col_cmp = f"{col}::numeric" + if op == ">": + return query.gt(col_cmp, value) + if op == ">=": + return query.gte(col_cmp, value) + if op == "<": + return query.lt(col_cmp, value) + return query.lte(col_cmp, value) + + if op == "in": + if not isinstance(value, list): + msg = "Filter operator 'in' requires a list value." + raise FilterError(msg) + return query.in_(col, value) + + if op == "not in": + if not isinstance(value, list): + msg = "Filter operator 'not in' requires a list value." + raise FilterError(msg) + return query.not_.in_(col, value) return query @@ -198,6 +275,14 @@ def write_documents( :param policy: How to handle duplicate documents. Defaults to DuplicatePolicy.FAIL. :returns: Number of documents written. 
""" + if not isinstance(documents, list): + msg = f"write_documents() expects a list of Document objects, got {type(documents).__name__}" + raise ValueError(msg) + for doc in documents: + if not isinstance(doc, Document): + msg = f"write_documents() expects Document objects, got {type(doc).__name__}" + raise ValueError(msg) + if self._client is None: msg = "Call warm_up() before using the document store." raise RuntimeError(msg) diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py index 8d98ec1cea..49ec51f00a 100644 --- a/integrations/supabase/tests/test_groonga_document_store.py +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -46,12 +46,6 @@ def groonga_store(mock_supabase_client, monkeypatch): # noqa: ARG001 store.warm_up() return store - -# ───────────────────────────────────────────── -# DOCUMENT STORE TESTS -# ───────────────────────────────────────────── - - class TestDocumentStore: def test_init_defaults(self, monkeypatch): monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") diff --git a/integrations/supabase/tests/test_groonga_integration.py b/integrations/supabase/tests/test_groonga_integration.py index 85e5232fb8..fd684fe535 100644 --- a/integrations/supabase/tests/test_groonga_integration.py +++ b/integrations/supabase/tests/test_groonga_integration.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 import os -import re import pytest from haystack.dataclasses import Document @@ -20,29 +19,25 @@ from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +# Defaults for the local Docker stack (docker-compose-groonga.yml). +# PostgREST is configured without a JWT secret, so the key is not validated. 
+_LOCAL_SUPABASE_URL = "http://localhost:8000" +_LOCAL_SERVICE_KEY = "local-dev-key-not-validated" -@pytest.fixture() -def document_store(request): - """ - Creates a real SupabaseGroongaDocumentStore connected to a test Supabase project. - Requires SUPABASE_URL and SUPABASE_SERVICE_KEY environment variables to be set. - """ - supabase_url = os.environ.get("SUPABASE_URL") - if not supabase_url: - pytest.skip("SUPABASE_URL not set") - - safe_name = re.sub(r"[^a-zA-Z0-9]", "_", request.node.name)[:40] - table_name = f"hg_{safe_name}" +def _make_store(request: pytest.FixtureRequest) -> SupabaseGroongaDocumentStore: # noqa: ARG001 + supabase_url = os.environ.get("SUPABASE_URL", _LOCAL_SUPABASE_URL) + service_key = os.environ.get("SUPABASE_SERVICE_KEY", _LOCAL_SERVICE_KEY) store = SupabaseGroongaDocumentStore( supabase_url=supabase_url, - supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), - table_name=table_name, - recreate_table=True, + supabase_key=Secret.from_token(service_key), + # Fixed table pre-created in init-pgroonga.sql so PostgREST knows about it at startup. + # Tests clear data in teardown instead of recreating the table. + table_name="haystack_groonga_test", + recreate_table=False, ) store.warm_up() - yield store - store.delete_all_documents() + return store @pytest.mark.integration @@ -53,11 +48,41 @@ class TestSupabaseGroongaDocumentStoreIntegration( FilterableDocsFixtureMixin, UpdateByFilterTest, ): - pass + @pytest.fixture + def document_store(self, request): + store = _make_store(request) + yield store + store.delete_all_documents() + + @staticmethod + def assert_documents_are_equal(received: list[Document], expected: list[Document]) -> None: + # Embeddings are not stored; strip them and sort by id for order-independent comparison. 
+ def normalize(doc: Document) -> Document: + return Document(id=doc.id, content=doc.content, meta=doc.meta) + + assert sorted([normalize(d) for d in received], key=lambda d: d.id or "") == sorted( + [normalize(d) for d in expected], key=lambda d: d.id or "" + ) + + def test_write_documents(self, document_store: SupabaseGroongaDocumentStore) -> None: + docs = [ + Document(content="First document", meta={"key": "val"}), + Document(content="Second document"), + ] + assert document_store.write_documents(docs, DuplicatePolicy.FAIL) == len(docs) + result = document_store.filter_documents() + self.assert_documents_are_equal(result, docs) @pytest.mark.integration class TestGroongaRetriever: + @pytest.fixture + def document_store(self, request): + store = _make_store(request) + yield store + store.delete_all_documents() + + def test_groonga_retrieval(self, document_store): docs = [ Document(content="Python is a great programming language"), From a8a9e3b9e6716f08987ac701b0675cf4a9b51dfd Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Mon, 1 Jun 2026 13:23:47 +0200 Subject: [PATCH 26/34] updating integration tests --- .../supabase/groonga_document_store.py | 113 ++++++++++++------ 1 file changed, 74 insertions(+), 39 deletions(-) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index 0fa5b153c4..e1e15873e4 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -9,8 +9,8 @@ from haystack import default_from_dict, default_to_dict, logging from haystack.dataclasses import Document from haystack.document_stores.errors import DuplicateDocumentError -from haystack.errors import FilterError from haystack.document_stores.types import DocumentStore, DuplicatePolicy +from haystack.errors import FilterError from haystack.utils.auth import Secret, deserialize_secrets_inplace from postgrest import CountMethod @@ -156,79 +156,119 @@ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Docume result = query.execute() return [self._to_haystack_document(row) for row in result.data if isinstance(row, dict)] + @staticmethod + def _meta_col(field: str, value: Any) -> str: + """ + Choose the PostgREST column expression for a meta field. + + Uses the JSONB accessor (->) for numeric values so that PostgREST performs + correct numeric comparison. Uses the text accessor (->>) for strings, booleans, + None, and mixed lists, which return the JSON value as text. 
+ """ + if not field.startswith("meta."): + return field + key = field[len("meta."):] + if isinstance(value, list): + all_numeric = all(isinstance(v, (int, float)) and not isinstance(v, bool) for v in value if v is not None) + return f"meta->{key}" if (all_numeric and value) else f"meta->>{key}" + if isinstance(value, (int, float)) and not isinstance(value, bool): + return f"meta->{key}" + return f"meta->>{key}" + + @staticmethod + def _normalize_value(value: Any) -> Any: + """Convert Python booleans to lowercase strings compatible with JSONB text accessor.""" + if isinstance(value, bool): + return "true" if value else "false" + return value + @staticmethod def _apply_filters(query: Any, filters: dict[str, Any]) -> Any: """ Applies Haystack filters to a PostgREST query builder. - Supports AND logical operator and all standard comparison operators. - OR is supported for simple (non-nested) equality/comparison conditions. + Supports AND, OR, NOT logical operators and all standard comparison operators. + OR and NOT are supported for simple (non-nested) conditions only. :param query: The Supabase query builder. :param filters: Haystack filter dict. :returns: The query with filters applied. - :raises FilterError: For unsupported operators or invalid value types. + :raises FilterError: For unsupported operators, invalid value types, or malformed filters. """ if not filters: return query - # Simple comparison: {"field": "...", "operator": "...", "value": "..."} if "field" in filters: return SupabaseGroongaDocumentStore._apply_condition(query, filters) - op = filters.get("operator", "AND") - conditions = filters.get("conditions", []) + if "operator" not in filters: + msg = "Logical filter must include an 'operator' key ('AND', 'OR', 'NOT')." + raise FilterError(msg) + + if "conditions" not in filters: + msg = "Logical filter must include a 'conditions' key." 
+ raise FilterError(msg) + + op = filters["operator"] + conditions = filters["conditions"] if op == "AND": for cond in conditions: query = SupabaseGroongaDocumentStore._apply_filters(query, cond) return query - if op == "OR": + if op in ("OR", "NOT"): + neg_map = {"==": "neq", "!=": "eq", ">": "lte", ">=": "lt", "<": "gte", "<=": "gt"} + pg_op_map = {"==": "eq", "!=": "neq", ">": "gt", ">=": "gte", "<": "lt", "<=": "lte"} + op_map = neg_map if op == "NOT" else pg_op_map parts = [] for cond in conditions: if "field" not in cond: - msg = "Nested logical operators inside OR are not supported." + msg = f"Nested logical operators inside {op} are not supported." + raise FilterError(msg) + cond_field = cond.get("field", "") + cond_op = cond.get("operator", "") + cond_value = cond.get("value") + if cond_op not in op_map: + msg = f"Operator '{cond_op}' inside {op} filter is not supported." raise FilterError(msg) - parts.append(SupabaseGroongaDocumentStore._condition_to_or_part(cond)) + col = SupabaseGroongaDocumentStore._meta_col(cond_field, cond_value) + norm = SupabaseGroongaDocumentStore._normalize_value(cond_value) + parts.append(f"{col}.{op_map[cond_op]}.{norm}") return query.or_(",".join(parts)) - msg = f"Filter operator '{op}' is not supported. Supported logical operators: AND, OR." + msg = f"Filter operator '{op}' is not supported. Supported logical operators: AND, OR, NOT." raise FilterError(msg) @staticmethod - def _condition_to_or_part(condition: dict[str, Any]) -> str: + def _apply_condition(query: Any, condition: dict[str, Any]) -> Any: field: str = condition.get("field", "") - op: str = condition.get("operator", "==") - value = condition.get("value") - col = f"meta->>{field[len('meta.'):]}" if field.startswith("meta.") else field - pg_op = {"==": "eq", "!=": "neq", ">": "gt", ">=": "gte", "<": "lt", "<=": "lte"} - if op not in pg_op: - msg = f"Operator '{op}' inside OR filter is not supported." 
+ + if "operator" not in condition: + msg = "Comparison filter must include an 'operator' key." raise FilterError(msg) - return f"{col}.{pg_op[op]}.{value}" - @staticmethod - def _apply_condition(query: Any, condition: dict[str, Any]) -> Any: - field: str = condition.get("field", "") - op: str = condition.get("operator", "==") - value = condition.get("value") + if "value" not in condition: + msg = "Comparison filter must include a 'value' key." + raise FilterError(msg) - # PostgREST JSONB text accessor: meta->>key (no quotes around key name) - col = f"meta->>{field[len('meta.'):]}" if field.startswith("meta.") else field + op: str = condition["operator"] + value = condition["value"] + + col = SupabaseGroongaDocumentStore._meta_col(field, value) + norm = SupabaseGroongaDocumentStore._normalize_value(value) if op == "==": - return query.is_(col, "null") if value is None else query.eq(col, value) + return query.is_(col, "null") if norm is None else query.eq(col, norm) if op == "!=": - return query.not_.is_(col, "null") if value is None else query.neq(col, value) + return query.not_.is_(col, "null") if norm is None else query.neq(col, norm) if op in (">", ">=", "<", "<="): if isinstance(value, list): msg = f"Filter operator '{op}' does not support list values." raise FilterError(msg) if value is None: - # No document satisfies an ordering comparison against NULL. return query.eq("id", "") if isinstance(value, str): try: @@ -236,18 +276,13 @@ def _apply_condition(query: Any, condition: dict[str, Any]) -> Any: except ValueError: msg = f"Filter operator '{op}' does not support plain string values. Use a numeric or ISO date value." raise FilterError(msg) - # ISO date strings sort correctly as text; no cast needed. - col_cmp = col - else: - # Numeric value: cast JSONB text to numeric for correct ordering. 
- col_cmp = f"{col}::numeric" if op == ">": - return query.gt(col_cmp, value) + return query.gt(col, norm) if op == ">=": - return query.gte(col_cmp, value) + return query.gte(col, norm) if op == "<": - return query.lt(col_cmp, value) - return query.lte(col_cmp, value) + return query.lt(col, norm) + return query.lte(col, norm) if op == "in": if not isinstance(value, list): @@ -404,7 +439,7 @@ def _groonga_retrieval( result = self._client.rpc( "groonga_search", - {"query_text": query, "table": self.table_name, "top_k": top_k}, + {"query_text": query, "table_name": self.table_name, "top_k": top_k}, ).execute() data = result.data if isinstance(result.data, list) else [] From 67d52b121e30ecba8627a29cc2039b83535bb497 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 1 Jun 2026 13:30:04 +0200 Subject: [PATCH 27/34] solving linting issues --- .../document_stores/supabase/groonga_document_store.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index e1e15873e4..1c525be792 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -273,9 +273,9 @@ def _apply_condition(query: Any, condition: dict[str, Any]) -> Any: if isinstance(value, str): try: _datetime.fromisoformat(value) - except ValueError: - msg = f"Filter operator '{op}' does not support plain string values. Use a numeric or ISO date value." - raise FilterError(msg) + except ValueError as err: + msg = f"Filter operator '{op}' does not support string values. Use a numeric or ISO date value." 
+ raise FilterError(msg) from err if op == ">": return query.gt(col, norm) if op == ">=": From 5be825c1035718bc4f52cc77ce543a51dcc404b5 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 1 Jun 2026 13:46:01 +0200 Subject: [PATCH 28/34] fixing filter tests and removing unit tests covered by integration tests --- .../supabase/groonga_document_store.py | 52 +++++++++++++++---- .../tests/test_groonga_document_store.py | 47 +---------------- 2 files changed, 44 insertions(+), 55 deletions(-) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index 1c525be792..9364784b08 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -217,24 +217,46 @@ def _apply_filters(query: Any, filters: dict[str, Any]) -> Any: query = SupabaseGroongaDocumentStore._apply_filters(query, cond) return query - if op in ("OR", "NOT"): - neg_map = {"==": "neq", "!=": "eq", ">": "lte", ">=": "lt", "<": "gte", "<=": "gt"} + if op == "OR": pg_op_map = {"==": "eq", "!=": "neq", ">": "gt", ">=": "gte", "<": "lt", "<=": "lte"} - op_map = neg_map if op == "NOT" else pg_op_map parts = [] for cond in conditions: if "field" not in cond: - msg = f"Nested logical operators inside {op} are not supported." + msg = "Nested logical operators inside OR are not supported." + raise FilterError(msg) + cond_field = cond.get("field", "") + cond_op = cond.get("operator", "") + cond_value = cond.get("value") + if cond_op not in pg_op_map: + msg = f"Operator '{cond_op}' inside OR filter is not supported." + raise FilterError(msg) + # Use text accessor (->>): PostgREST OR strings don't support JSONB (->) expressions. 
+ col = f"meta->>{cond_field[len('meta.'):]}" if cond_field.startswith("meta.") else cond_field + norm = SupabaseGroongaDocumentStore._normalize_value(cond_value) + parts.append(f"{col}.{pg_op_map[cond_op]}.{norm}") + return query.or_(",".join(parts)) + + if op == "NOT": + # NOT(A AND B) = NOT_A OR NOT_B, with null-inclusive semantics. + # Use text accessor: PostgREST OR strings don't support JSONB (->) expressions. + neg_map = {"==": "neq", "!=": "eq", ">": "lte", ">=": "lt", "<": "gte", "<=": "gt"} + parts = [] + for cond in conditions: + if "field" not in cond: + msg = "Nested logical operators inside NOT are not supported." raise FilterError(msg) cond_field = cond.get("field", "") cond_op = cond.get("operator", "") cond_value = cond.get("value") - if cond_op not in op_map: - msg = f"Operator '{cond_op}' inside {op} filter is not supported." + if cond_op not in neg_map: + msg = f"Operator '{cond_op}' inside NOT filter is not supported." raise FilterError(msg) - col = SupabaseGroongaDocumentStore._meta_col(cond_field, cond_value) + col = f"meta->>{cond_field[len('meta.'):]}" if cond_field.startswith("meta.") else cond_field norm = SupabaseGroongaDocumentStore._normalize_value(cond_value) - parts.append(f"{col}.{op_map[cond_op]}.{norm}") + parts.append(f"{col}.{neg_map[cond_op]}.{norm}") + if cond_op == "==" and cond_field.startswith("meta."): + # NOT(field==value) also covers docs where the field is absent (SQL NULL semantics) + parts.append(f"{col}.is.null") return query.or_(",".join(parts)) msg = f"Filter operator '{op}' is not supported. Supported logical operators: AND, OR, NOT." 
@@ -262,7 +284,13 @@ def _apply_condition(query: Any, condition: dict[str, Any]) -> Any: return query.is_(col, "null") if norm is None else query.eq(col, norm) if op == "!=": - return query.not_.is_(col, "null") if norm is None else query.neq(col, norm) + if norm is None: + return query.not_.is_(col, "null") + if field.startswith("meta."): + # SQL: NULL != value returns NULL (not TRUE), so include docs where the field is absent. + key = field[len("meta."):] + return query.or_(f"{col}.neq.{norm},meta->>{key}.is.null") + return query.neq(col, norm) if op in (">", ">=", "<", "<="): if isinstance(value, list): @@ -294,6 +322,12 @@ def _apply_condition(query: Any, condition: dict[str, Any]) -> Any: if not isinstance(value, list): msg = "Filter operator 'not in' requires a list value." raise FilterError(msg) + if field.startswith("meta."): + # SQL: NULL NOT IN (...) returns NULL, so include docs where the field is absent. + key = field[len("meta."):] + non_none = [v for v in value if v is not None] + vals = ",".join(str(v) for v in non_none) + return query.or_(f"{col}.not.in.({vals}),meta->>{key}.is.null") return query.not_.in_(col, value) return query diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py index 49ec51f00a..adf3d8b74d 100644 --- a/integrations/supabase/tests/test_groonga_document_store.py +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -134,51 +134,6 @@ def test_write_documents_fail_on_duplicate(self, groonga_store, mock_supabase_cl with pytest.raises(DuplicateDocumentError): groonga_store.write_documents([Document(content="duplicate doc")], policy=DuplicatePolicy.FAIL) - def test_delete_by_filter(self, groonga_store, mock_supabase_client): - mock_table = mock_supabase_client.table.return_value - mock_table.select.return_value.execute.return_value = MagicMock( - data=[{"id": "1", "content": "doc one", "meta": {"lang": "en"}, "score": None}] - ) - 
mock_table.delete.return_value.in_.return_value.execute.return_value = MagicMock(data=[]) - - deleted = groonga_store.delete_by_filter( - filters={"conditions": [{"field": "meta.lang", "operator": "==", "value": "en"}]} - ) - assert deleted == 1 - - def test_delete_by_filter_no_matches(self, groonga_store, mock_supabase_client): - mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock(data=[]) - - deleted = groonga_store.delete_by_filter( - filters={"conditions": [{"field": "meta.lang", "operator": "==", "value": "fr"}]} - ) - assert deleted == 0 - - def test_update_by_filter(self, groonga_store, mock_supabase_client): - mock_table = mock_supabase_client.table.return_value - mock_table.select.return_value.execute.return_value = MagicMock( - data=[{"id": "1", "content": "doc one", "meta": {"lang": "en"}, "score": None}] - ) - mock_table.upsert.return_value.execute.return_value = MagicMock(data=[{}]) - - updated = groonga_store.update_by_filter( - filters={"conditions": [{"field": "meta.lang", "operator": "==", "value": "en"}]}, - meta={"reviewed": True}, - ) - assert updated == 1 - mock_table.upsert.assert_called_once() - upserted_row = mock_table.upsert.call_args[0][0] - assert upserted_row["meta"] == {"lang": "en", "reviewed": True} - - def test_update_by_filter_no_matches(self, groonga_store, mock_supabase_client): - mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock(data=[]) - - updated = groonga_store.update_by_filter( - filters={"conditions": [{"field": "meta.lang", "operator": "==", "value": "fr"}]}, - meta={"reviewed": True}, - ) - assert updated == 0 - def test_delete_all_documents(self, groonga_store, mock_supabase_client): mock_table = mock_supabase_client.table.return_value mock_table.delete.return_value.neq.return_value.execute.return_value = MagicMock(data=[]) @@ -214,7 +169,7 @@ def test_filter_documents_with_filters(self, groonga_store, mock_supabase_client 
mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock( data=[{"id": "1", "content": "Python is great", "meta": {"language": "en"}, "score": None}] ) - filters = {"conditions": [{"field": "meta.language", "operator": "==", "value": "en"}]} + filters = {"operator": "AND", "conditions": [{"field": "meta.language", "operator": "==", "value": "en"}]} docs = groonga_store.filter_documents(filters=filters) assert len(docs) == 1 From 520717c7fb544ab6da579524be7229e9fb2959b7 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 1 Jun 2026 13:47:28 +0200 Subject: [PATCH 29/34] formatting --- .../document_stores/supabase/groonga_document_store.py | 10 +++++----- .../supabase/tests/test_groonga_document_store.py | 1 + .../supabase/tests/test_groonga_integration.py | 1 - integrations/supabase/tests/test_groonga_retriever.py | 4 +--- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index 9364784b08..91f3a2fb87 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -167,7 +167,7 @@ def _meta_col(field: str, value: Any) -> str: """ if not field.startswith("meta."): return field - key = field[len("meta."):] + key = field[len("meta.") :] if isinstance(value, list): all_numeric = all(isinstance(v, (int, float)) and not isinstance(v, bool) for v in value if v is not None) return f"meta->{key}" if (all_numeric and value) else f"meta->>{key}" @@ -231,7 +231,7 @@ def _apply_filters(query: Any, filters: dict[str, Any]) -> Any: msg = f"Operator '{cond_op}' inside OR filter is not supported." 
raise FilterError(msg) # Use text accessor (->>): PostgREST OR strings don't support JSONB (->) expressions. - col = f"meta->>{cond_field[len('meta.'):]}" if cond_field.startswith("meta.") else cond_field + col = f"meta->>{cond_field[len('meta.') :]}" if cond_field.startswith("meta.") else cond_field norm = SupabaseGroongaDocumentStore._normalize_value(cond_value) parts.append(f"{col}.{pg_op_map[cond_op]}.{norm}") return query.or_(",".join(parts)) @@ -251,7 +251,7 @@ def _apply_filters(query: Any, filters: dict[str, Any]) -> Any: if cond_op not in neg_map: msg = f"Operator '{cond_op}' inside NOT filter is not supported." raise FilterError(msg) - col = f"meta->>{cond_field[len('meta.'):]}" if cond_field.startswith("meta.") else cond_field + col = f"meta->>{cond_field[len('meta.') :]}" if cond_field.startswith("meta.") else cond_field norm = SupabaseGroongaDocumentStore._normalize_value(cond_value) parts.append(f"{col}.{neg_map[cond_op]}.{norm}") if cond_op == "==" and cond_field.startswith("meta."): @@ -288,7 +288,7 @@ def _apply_condition(query: Any, condition: dict[str, Any]) -> Any: return query.not_.is_(col, "null") if field.startswith("meta."): # SQL: NULL != value returns NULL (not TRUE), so include docs where the field is absent. - key = field[len("meta."):] + key = field[len("meta.") :] return query.or_(f"{col}.neq.{norm},meta->>{key}.is.null") return query.neq(col, norm) @@ -324,7 +324,7 @@ def _apply_condition(query: Any, condition: dict[str, Any]) -> Any: raise FilterError(msg) if field.startswith("meta."): # SQL: NULL NOT IN (...) returns NULL, so include docs where the field is absent. 
- key = field[len("meta."):] + key = field[len("meta.") :] non_none = [v for v in value if v is not None] vals = ",".join(str(v) for v in non_none) return query.or_(f"{col}.not.in.({vals}),meta->>{key}.is.null") diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py index adf3d8b74d..c6b239a9ce 100644 --- a/integrations/supabase/tests/test_groonga_document_store.py +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -46,6 +46,7 @@ def groonga_store(mock_supabase_client, monkeypatch): # noqa: ARG001 store.warm_up() return store + class TestDocumentStore: def test_init_defaults(self, monkeypatch): monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") diff --git a/integrations/supabase/tests/test_groonga_integration.py b/integrations/supabase/tests/test_groonga_integration.py index fd684fe535..3423774080 100644 --- a/integrations/supabase/tests/test_groonga_integration.py +++ b/integrations/supabase/tests/test_groonga_integration.py @@ -82,7 +82,6 @@ def document_store(self, request): yield store store.delete_all_documents() - def test_groonga_retrieval(self, document_store): docs = [ Document(content="Python is a great programming language"), diff --git a/integrations/supabase/tests/test_groonga_retriever.py b/integrations/supabase/tests/test_groonga_retriever.py index 666998ccf2..7e165166e9 100644 --- a/integrations/supabase/tests/test_groonga_retriever.py +++ b/integrations/supabase/tests/test_groonga_retriever.py @@ -95,9 +95,7 @@ def test_to_dict(self, groonga_store): def test_from_dict(self, mock_supabase_client, monkeypatch): # noqa: ARG002 monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") data = { - "type": ( - "haystack_integrations.components.retrievers.supabase.groonga_retriever.SupabaseGroongaRetriever" - ), + "type": ("haystack_integrations.components.retrievers.supabase.groonga_retriever.SupabaseGroongaRetriever"), "init_parameters": { "top_k": 
7, "filters": {}, From 48c249dd1e0e7eb27db3bad9d9f8c9f577a7c0e0 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 1 Jun 2026 14:15:14 +0200 Subject: [PATCH 30/34] renaming the retriever since it only handles full text for consistency and pinning dependencies for CountMethod to work --- integrations/supabase/pyproject.toml | 2 +- .../retrievers/supabase/__init__.py | 4 +- .../retrievers/supabase/groonga_retriever.py | 153 ------------------ .../supabase/groonga_document_store.py | 17 +- .../tests/test_groonga_integration.py | 6 +- .../supabase/tests/test_groonga_retriever.py | 22 +-- 6 files changed, 31 insertions(+), 173 deletions(-) delete mode 100644 integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py diff --git a/integrations/supabase/pyproject.toml b/integrations/supabase/pyproject.toml index fdaf412c09..aee0d50272 100644 --- a/integrations/supabase/pyproject.toml +++ b/integrations/supabase/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.26.1", "pgvector-haystack>=6.3.0", "supabase>=2.9.0"] +dependencies = ["haystack-ai>=2.26.1", "pgvector-haystack>=6.3.0", "supabase>=2.23.0"] [project.urls] Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase#readme" diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py index 30c347418c..e1f90930c4 100644 --- a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py @@ -2,11 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 from .embedding_retriever import SupabasePgvectorEmbeddingRetriever 
-from .groonga_retriever import SupabaseGroongaRetriever +from .groonga_bm25_retriever import SupabaseGroongaBM25Retriever from .keyword_retriever import SupabasePgvectorKeywordRetriever __all__ = [ - "SupabaseGroongaRetriever", + "SupabaseGroongaBM25Retriever", "SupabasePgvectorEmbeddingRetriever", "SupabasePgvectorKeywordRetriever", ] diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py deleted file mode 100644 index 36f4526342..0000000000 --- a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_retriever.py +++ /dev/null @@ -1,153 +0,0 @@ -# SPDX-FileCopyrightText: 2023-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -import copy -from typing import Any - -from haystack import component, default_from_dict, default_to_dict -from haystack.dataclasses import Document -from haystack.document_stores.types import FilterPolicy -from haystack.document_stores.types.filter_policy import apply_filter_policy - -from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore - - -@component -class SupabaseGroongaRetriever: - """ - Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. - - This retriever works without embeddings — it searches documents using plain text queries. - It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. - - Note: async operations are not supported as the supabase-py sync client does not expose - awaitable query methods. Use the sync run() method instead. 
- - Example usage: - - ```python - from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore - from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever - from haystack.utils import Secret - - document_store = SupabaseGroongaDocumentStore( - supabase_url="https://.supabase.co", - supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), - table_name="haystack_fts_documents", - ) - document_store.warm_up() - - retriever = SupabaseGroongaRetriever(document_store=document_store, top_k=10) - result = retriever.run(query="python programming") - print(result["documents"]) - ``` - """ - - def __init__( - self, - *, - document_store: SupabaseGroongaDocumentStore, - filters: dict[str, Any] | None = None, - top_k: int = 10, - filter_policy: str | FilterPolicy = FilterPolicy.REPLACE, - ) -> None: - """ - Initialize the SupabaseGroongaRetriever. - - :param document_store: An instance of SupabaseGroongaDocumentStore. - :param filters: Optional filters applied to retrieved Documents. - :param top_k: Maximum number of Documents to return. Defaults to 10. - :param filter_policy: Policy to determine how filters are applied. - :raises ValueError: If document_store is not an instance of SupabaseGroongaDocumentStore. - """ - if not isinstance(document_store, SupabaseGroongaDocumentStore): - msg = "document_store must be an instance of SupabaseGroongaDocumentStore" - raise ValueError(msg) - - self.document_store = document_store - self.filters = filters or {} - self.top_k = top_k - self.filter_policy = ( - filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy) - ) - - @component.output_types(documents=list[Document]) - def run( - self, - query: str, - filters: dict[str, Any] | None = None, - top_k: int | None = None, - ) -> dict[str, list[Document]]: - """ - Runs the retriever on the given query. - - :param query: The text query to search for. 
- :param filters: Optional runtime filters. Merged or replaced based on filter_policy. - :param top_k: Optional override for maximum number of documents to return. - :returns: Dictionary with key "documents" containing list of matching Documents. - """ - if not query: - return {"documents": []} - - merged_filters = apply_filter_policy(self.filter_policy, self.filters, filters) - effective_top_k = top_k if top_k is not None else self.top_k - - documents = self.document_store._groonga_retrieval( - query=query, - top_k=effective_top_k, - filters=merged_filters, - ) - - return {"documents": documents} - - @component.output_types(documents=list[Document]) - async def run_async( - self, - query: str, - filters: dict[str, Any] | None = None, - top_k: int | None = None, - ) -> dict[str, list[Document]]: - """ - Async version of run(). - - Note: supabase-py's sync client does not support native async queries. - This method runs the synchronous retrieval and returns the result. - For fully async support, consider using acreate_client() from supabase-py - and refactoring the document store accordingly. - - :param query: The text query to search for. - :param filters: Optional runtime filters. Merged or replaced based on filter_policy. - :param top_k: Optional override for maximum number of documents to return. - :returns: Dictionary with key "documents" containing list of matching Documents. - """ - return self.run(query=query, filters=filters, top_k=top_k) - - def to_dict(self) -> dict[str, Any]: - """ - Serializes the component to a dictionary. - - :returns: Dictionary with serialized data. - """ - return default_to_dict( - self, - filters=self.filters, - top_k=self.top_k, - filter_policy=self.filter_policy.value, - document_store=self.document_store.to_dict(), - ) - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "SupabaseGroongaRetriever": - """ - Deserializes the component from a dictionary. - - :param data: Dictionary to deserialize from. 
- :returns: Deserialized component. - """ - data = copy.deepcopy(data) - doc_store_params = data["init_parameters"]["document_store"] - data["init_parameters"]["document_store"] = SupabaseGroongaDocumentStore.from_dict(doc_store_params) - if filter_policy := data["init_parameters"].get("filter_policy"): - data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(filter_policy) - return default_from_dict(cls, data) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py index 91f3a2fb87..7072879b40 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -138,11 +138,22 @@ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Docume """ Returns documents matching the given filters. - Supported filters: equality filters on `id`, `content`, and `meta` fields. + Supports the standard Haystack filter syntax with the following operators: - :param filters: Optional dictionary of filters. - Example: ``{"field": "meta.language", "operator": "==", "value": "en"}`` + - Comparison: ``==``, ``!=``, ``>``, ``>=``, ``<``, ``<=``, ``in``, ``not in`` + - Logical: ``AND``, ``OR``, ``NOT`` (``OR`` and ``NOT`` support simple conditions + only — no nested logical operators inside them) + + **Known limitation:** For ``!=`` and ``not in`` on ``meta.*`` fields, documents + where the field is absent are included in the result (matching Python ``None != value`` + semantics). For ``>`` / ``>=`` / ``<`` / ``<=``, documents where the field is absent + are excluded (SQL ``NULL`` comparison semantics). + + :param filters: Optional Haystack filter dict. 
+ Simple comparison: ``{"field": "meta.language", "operator": "==", "value": "en"}`` + Logical: ``{"operator": "AND", "conditions": [...]}`` :returns: List of matching Document objects. + :raises FilterError: If the filter structure is malformed or uses an unsupported operator. """ if self._client is None: msg = "Call warm_up() before using the document store." diff --git a/integrations/supabase/tests/test_groonga_integration.py b/integrations/supabase/tests/test_groonga_integration.py index 3423774080..e8dc1e8eba 100644 --- a/integrations/supabase/tests/test_groonga_integration.py +++ b/integrations/supabase/tests/test_groonga_integration.py @@ -16,7 +16,7 @@ ) from haystack.utils import Secret -from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever +from haystack_integrations.components.retrievers.supabase import SupabaseGroongaBM25Retriever from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore # Defaults for the local Docker stack (docker-compose-groonga.yml). 
@@ -108,7 +108,7 @@ def test_retriever_run(self, document_store): ] document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - retriever = SupabaseGroongaRetriever(document_store=document_store, top_k=5) + retriever = SupabaseGroongaBM25Retriever(document_store=document_store, top_k=5) result = retriever.run(query="Python") assert "documents" in result @@ -116,5 +116,5 @@ def test_retriever_run(self, document_store): assert any("Python" in doc.content for doc in result["documents"]) def test_retriever_empty_query(self, document_store): - retriever = SupabaseGroongaRetriever(document_store=document_store) + retriever = SupabaseGroongaBM25Retriever(document_store=document_store) assert retriever.run(query="") == {"documents": []} diff --git a/integrations/supabase/tests/test_groonga_retriever.py b/integrations/supabase/tests/test_groonga_retriever.py index 7e165166e9..d172946f4d 100644 --- a/integrations/supabase/tests/test_groonga_retriever.py +++ b/integrations/supabase/tests/test_groonga_retriever.py @@ -6,7 +6,7 @@ import pytest -from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever +from haystack_integrations.components.retrievers.supabase import SupabaseGroongaBM25Retriever from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore @@ -48,26 +48,26 @@ def groonga_store(mock_supabase_client, monkeypatch): # noqa: ARG001 class TestRetriever: def test_init_invalid_store(self): with pytest.raises(ValueError, match="document_store must be an instance"): - SupabaseGroongaRetriever(document_store="not_a_store") + SupabaseGroongaBM25Retriever(document_store="not_a_store") def test_init(self, groonga_store): - retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store, top_k=5) assert retriever.top_k == 5 assert retriever.document_store is groonga_store def test_init_default_top_k(self, groonga_store): - 
retriever = SupabaseGroongaRetriever(document_store=groonga_store) + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store) assert retriever.top_k == 10 def test_run_empty_query(self, groonga_store): - retriever = SupabaseGroongaRetriever(document_store=groonga_store) + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store) assert retriever.run(query="") == {"documents": []} def test_run(self, groonga_store, mock_supabase_client): mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( data=[{"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}] ) - retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store, top_k=5) result = retriever.run(query="Python") assert len(result["documents"]) == 1 assert result["documents"][0].content == "Python is great" @@ -77,17 +77,17 @@ async def test_run_async(self, groonga_store, mock_supabase_client): mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( data=[{"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}] ) - retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store, top_k=5) result = await retriever.run_async(query="Python") assert len(result["documents"]) == 1 @pytest.mark.asyncio async def test_run_async_empty_query(self, groonga_store): - retriever = SupabaseGroongaRetriever(document_store=groonga_store) + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store) assert await retriever.run_async(query="") == {"documents": []} def test_to_dict(self, groonga_store): - retriever = SupabaseGroongaRetriever(document_store=groonga_store, top_k=5) + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store, top_k=5) result = retriever.to_dict() assert result["init_parameters"]["top_k"] == 5 assert "document_store" in 
result["init_parameters"] @@ -95,7 +95,7 @@ def test_to_dict(self, groonga_store): def test_from_dict(self, mock_supabase_client, monkeypatch): # noqa: ARG002 monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") data = { - "type": ("haystack_integrations.components.retrievers.supabase.groonga_retriever.SupabaseGroongaRetriever"), + "type": ("haystack_integrations.components.retrievers.supabase.groonga_bm25_retriever.SupabaseGroongaBM25Retriever"), # noqa: E501 "init_parameters": { "top_k": 7, "filters": {}, @@ -118,5 +118,5 @@ def test_from_dict(self, mock_supabase_client, monkeypatch): # noqa: ARG002 }, }, } - retriever = SupabaseGroongaRetriever.from_dict(data) + retriever = SupabaseGroongaBM25Retriever.from_dict(data) assert retriever.top_k == 7 From d6672759906aa95b683f5af4bc43e7b983e85e01 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 1 Jun 2026 14:18:54 +0200 Subject: [PATCH 31/34] formatting --- integrations/supabase/tests/test_groonga_retriever.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/integrations/supabase/tests/test_groonga_retriever.py b/integrations/supabase/tests/test_groonga_retriever.py index d172946f4d..cdd121c13f 100644 --- a/integrations/supabase/tests/test_groonga_retriever.py +++ b/integrations/supabase/tests/test_groonga_retriever.py @@ -95,7 +95,9 @@ def test_to_dict(self, groonga_store): def test_from_dict(self, mock_supabase_client, monkeypatch): # noqa: ARG002 monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") data = { - "type": ("haystack_integrations.components.retrievers.supabase.groonga_bm25_retriever.SupabaseGroongaBM25Retriever"), # noqa: E501 + "type": ( + "haystack_integrations.components.retrievers.supabase.groonga_bm25_retriever.SupabaseGroongaBM25Retriever" + ), # noqa: E501 "init_parameters": { "top_k": 7, "filters": {}, From 0c38dc6cd2a7f15959521ab5b82bb0a995d860bc Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Mon, 1 Jun 2026 14:20:58 +0200 Subject: [PATCH 32/34] removing unused ignore --- integrations/supabase/tests/test_groonga_retriever.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/supabase/tests/test_groonga_retriever.py b/integrations/supabase/tests/test_groonga_retriever.py index cdd121c13f..21102008b2 100644 --- a/integrations/supabase/tests/test_groonga_retriever.py +++ b/integrations/supabase/tests/test_groonga_retriever.py @@ -97,7 +97,7 @@ def test_from_dict(self, mock_supabase_client, monkeypatch): # noqa: ARG002 data = { "type": ( "haystack_integrations.components.retrievers.supabase.groonga_bm25_retriever.SupabaseGroongaBM25Retriever" - ), # noqa: E501 + ), "init_parameters": { "top_k": 7, "filters": {}, From b8b0ef3a1f93cd0b618bae2d785a9139fb279e02 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 1 Jun 2026 14:27:13 +0200 Subject: [PATCH 33/34] adding missed file --- .../retrievers/supabase/__init__.py | 1 + .../supabase/groonga_bm25_retriever.py | 153 ++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_bm25_retriever.py diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py index e1f90930c4..fc330a7028 100644 --- a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 + from .embedding_retriever import SupabasePgvectorEmbeddingRetriever from .groonga_bm25_retriever import SupabaseGroongaBM25Retriever from .keyword_retriever import SupabasePgvectorKeywordRetriever diff --git 
a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_bm25_retriever.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_bm25_retriever.py new file mode 100644 index 0000000000..8442457478 --- /dev/null +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_bm25_retriever.py @@ -0,0 +1,153 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import copy +from typing import Any + +from haystack import component, default_from_dict, default_to_dict +from haystack.dataclasses import Document +from haystack.document_stores.types import FilterPolicy +from haystack.document_stores.types.filter_policy import apply_filter_policy + +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + + +@component +class SupabaseGroongaBM25Retriever: + """ + Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. + + This retriever works without embeddings — it searches documents using plain text queries. + It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. + + Note: native async retrieval is not supported, as the supabase-py sync client does not expose + awaitable query methods. run_async() is provided but runs the synchronous run() under the hood. 
+ + Example usage: + + ```python + from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + from haystack_integrations.components.retrievers.supabase import SupabaseGroongaBM25Retriever + from haystack.utils import Secret + + document_store = SupabaseGroongaDocumentStore( + supabase_url="https://<your-project>.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", + ) + document_store.warm_up() + + retriever = SupabaseGroongaBM25Retriever(document_store=document_store, top_k=10) + result = retriever.run(query="python programming") + print(result["documents"]) + ``` + """ + + def __init__( + self, + *, + document_store: SupabaseGroongaDocumentStore, + filters: dict[str, Any] | None = None, + top_k: int = 10, + filter_policy: str | FilterPolicy = FilterPolicy.REPLACE, + ) -> None: + """ + Initialize the SupabaseGroongaBM25Retriever. + + :param document_store: An instance of SupabaseGroongaDocumentStore. + :param filters: Optional filters applied to retrieved Documents. + :param top_k: Maximum number of Documents to return. Defaults to 10. + :param filter_policy: Policy to determine how filters are applied. + :raises ValueError: If document_store is not an instance of SupabaseGroongaDocumentStore. + """ + if not isinstance(document_store, SupabaseGroongaDocumentStore): + msg = "document_store must be an instance of SupabaseGroongaDocumentStore" + raise ValueError(msg) + + self.document_store = document_store + self.filters = filters or {} + self.top_k = top_k + self.filter_policy = ( + filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy) + ) + + @component.output_types(documents=list[Document]) + def run( + self, + query: str, + filters: dict[str, Any] | None = None, + top_k: int | None = None, + ) -> dict[str, list[Document]]: + """ + Runs the retriever on the given query. 
+ + :param query: The text query to search for. 
+ :param filters: Optional runtime filters. Merged or replaced based on filter_policy. + :param top_k: Optional override for maximum number of documents to return. + :returns: Dictionary with key "documents" containing list of matching Documents. + """ + if not query: + return {"documents": []} + + merged_filters = apply_filter_policy(self.filter_policy, self.filters, filters) + effective_top_k = top_k if top_k is not None else self.top_k + + documents = self.document_store._groonga_retrieval( + query=query, + top_k=effective_top_k, + filters=merged_filters, + ) + + return {"documents": documents} + + @component.output_types(documents=list[Document]) + async def run_async( + self, + query: str, + filters: dict[str, Any] | None = None, + top_k: int | None = None, + ) -> dict[str, list[Document]]: + """ + Async version of run(). + + Note: supabase-py's sync client does not support native async queries. + This method runs the synchronous retrieval and returns the result. + For fully async support, consider using acreate_client() from supabase-py + and refactoring the document store accordingly. + + :param query: The text query to search for. + :param filters: Optional runtime filters. Merged or replaced based on filter_policy. + :param top_k: Optional override for maximum number of documents to return. + :returns: Dictionary with key "documents" containing list of matching Documents. + """ + return self.run(query=query, filters=filters, top_k=top_k) + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: Dictionary with serialized data. + """ + return default_to_dict( + self, + filters=self.filters, + top_k=self.top_k, + filter_policy=self.filter_policy.value, + document_store=self.document_store.to_dict(), + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "SupabaseGroongaBM25Retriever": + """ + Deserializes the component from a dictionary. + + :param data: Dictionary to deserialize from. 
+ :returns: Deserialized component. + """ + data = copy.deepcopy(data) + doc_store_params = data["init_parameters"]["document_store"] + data["init_parameters"]["document_store"] = SupabaseGroongaDocumentStore.from_dict(doc_store_params) + if filter_policy := data["init_parameters"].get("filter_policy"): + data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(filter_policy) + return default_from_dict(cls, data) From 13c8032394caca63685330ee6bfe11baa198d045 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 1 Jun 2026 15:32:04 +0200 Subject: [PATCH 34/34] documenting integration tests --- .../tests/test_groonga_integration.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/integrations/supabase/tests/test_groonga_integration.py b/integrations/supabase/tests/test_groonga_integration.py index e8dc1e8eba..9d0fdf5d3e 100644 --- a/integrations/supabase/tests/test_groonga_integration.py +++ b/integrations/supabase/tests/test_groonga_integration.py @@ -2,6 +2,28 @@ # # SPDX-License-Identifier: Apache-2.0 +# Integration tests for SupabaseGroongaDocumentStore and SupabaseGroongaBM25Retriever. +# +# These tests require a running stack of three Docker containers defined in +# docker-compose-groonga.yml: +# +# pgroonga-postgres PostgreSQL 17 + PGroonga extension (port 5433) +# postgrest PostgREST REST API on top of PostgreSQL (port 3000) +# nginx Reverse proxy that strips the /rest/v1/ prefix that +# supabase-py always appends, then forwards to PostgREST +# (port 8000 — the URL used by supabase-py) +# +# Start the stack locally with: +# make docker-groonga +# +# The test fixture falls back to http://localhost:8000 when SUPABASE_URL is not +# set, so no environment variables are required for local development. +# +# All tests share a single pre-created table (haystack_groonga_test) defined in +# init-pgroonga.sql. 
PostgREST caches its schema at startup and does not reload +# it for tables created later, so the table must exist before PostgREST starts. +# Data is cleared in fixture teardown instead of recreating the table. + import os import pytest