diff --git a/.github/workflows/supabase.yml b/.github/workflows/supabase.yml index c2abdf5340..33d2a0b8ed 100644 --- a/.github/workflows/supabase.yml +++ b/.github/workflows/supabase.yml @@ -120,11 +120,31 @@ jobs: name: coverage-comment-supabase path: python-coverage-comment-action-supabase.txt - - name: Run integration tests + - name: Run pgvector integration tests if: runner.os == 'Linux' env: SUPABASE_DB_URL: "postgresql://postgres:postgres@localhost:5432/postgres" - run: hatch run test:integration-cov-append-retry + run: hatch run test:integration-cov-append-retry --ignore=tests/test_groonga_integration.py + + - name: Start PGroonga + PostgREST stack + if: runner.os == 'Linux' + run: docker compose -f docker-compose-groonga.yml up -d --build + + - name: Wait for PGroonga stack to be ready + if: runner.os == 'Linux' + run: | + for i in $(seq 1 30); do + if curl -sf http://localhost:8000/rest/v1/ -H "apikey: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hj04zWl196z2-SBc0"; then + echo "PGroonga stack is ready" + break + fi + echo "Waiting for PGroonga stack... ($i/30)" + sleep 5 + done + + - name: Run PGroonga integration tests + if: runner.os == 'Linux' + run: hatch run test:integration-cov-append-retry tests/test_groonga_integration.py - name: Store combined coverage if: github.event_name == 'push' diff --git a/integrations/supabase/Dockerfile.pgroonga b/integrations/supabase/Dockerfile.pgroonga new file mode 100644 index 0000000000..a6623b41c6 --- /dev/null +++ b/integrations/supabase/Dockerfile.pgroonga @@ -0,0 +1,10 @@ +FROM postgres:17-bookworm + +RUN apt-get update && \ + apt-get install -y wget gnupg2 && \ + wget -q -O /tmp/groonga-apt-source.deb \ + https://packages.groonga.org/debian/groonga-apt-source-latest-bookworm.deb && \ + dpkg -i /tmp/groonga-apt-source.deb && \ + apt-get update && \ + apt-get install -y postgresql-17-pgdg-pgroonga && \ + rm -rf /var/lib/apt/lists/* /tmp/groonga-apt-source.deb diff --git a/integrations/supabase/docker-compose-groonga.yml b/integrations/supabase/docker-compose-groonga.yml new file mode 100644 index 0000000000..4886fb972c --- /dev/null +++ b/integrations/supabase/docker-compose-groonga.yml @@ -0,0 +1,42 @@ +services: + pgroonga-postgres: + build: + context: . + dockerfile: Dockerfile.pgroonga + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + ports: + - "5433:5432" + volumes: + - ./init-pgroonga.sql:/docker-entrypoint-initdb.d/init-pgroonga.sql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 10s + timeout: 5s + retries: 10 + + postgrest: + image: postgrest/postgrest:v12.2.0 + environment: + PGRST_DB_URI: postgres://postgres:postgres@pgroonga-postgres:5432/postgres + PGRST_DB_SCHEMAS: public + # No PGRST_JWT_SECRET → JWT validation disabled; all requests run as PGRST_DB_ANON_ROLE. + # supabase-py still sends an apikey header but PostgREST ignores it. + PGRST_DB_ANON_ROLE: postgres + PGRST_LOG_LEVEL: info + ports: + - "3000:3000" + depends_on: + pgroonga-postgres: + condition: service_healthy + + nginx: + image: nginx:alpine + ports: + - "8000:8000" + volumes: + - ./nginx-groonga.conf:/etc/nginx/nginx.conf:ro + depends_on: + - postgrest diff --git a/integrations/supabase/init-pgroonga.sql b/integrations/supabase/init-pgroonga.sql new file mode 100644 index 0000000000..745428b823 --- /dev/null +++ b/integrations/supabase/init-pgroonga.sql @@ -0,0 +1,62 @@ +-- Enable PGroonga extension +CREATE EXTENSION IF NOT EXISTS pgroonga; + +-- PostgreSQL role that PostgREST switches to when a service_role JWT is presented. +-- The role must exist before PostgREST connects. +DO $$ +BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'service_role') THEN + CREATE ROLE service_role NOLOGIN; + END IF; +END +$$; + +GRANT ALL ON SCHEMA public TO service_role; +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO service_role; +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO service_role; +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON FUNCTIONS TO service_role; + +-- exec_sql: allows the document store to create/drop tables and indexes via RPC. +CREATE OR REPLACE FUNCTION exec_sql(query TEXT) +RETURNS VOID AS $$ +BEGIN + EXECUTE query; +END; +$$ LANGUAGE plpgsql SECURITY DEFINER; + +GRANT EXECUTE ON FUNCTION exec_sql(TEXT) TO service_role; + +-- groonga_search: full-text search via PGroonga, called by _groonga_retrieval(). +CREATE OR REPLACE FUNCTION groonga_search(query_text TEXT, table_name TEXT, top_k INT) +RETURNS TABLE(id TEXT, content TEXT, meta JSONB, score REAL) AS $$ +DECLARE + sql TEXT; +BEGIN + sql := format( + 'SELECT id, content, meta, pgroonga_score(tableoid, ctid)::REAL AS score + FROM %I + WHERE content &@~ %L + ORDER BY score DESC + LIMIT %s', + table_name, query_text, top_k + ); + RETURN QUERY EXECUTE sql; +END; +$$ LANGUAGE plpgsql; + +GRANT EXECUTE ON FUNCTION groonga_search(TEXT, TEXT, INT) TO service_role; + +-- Pre-create the test table so PostgREST includes it in its schema cache at startup. +-- Tests use this fixed table and clear data between runs instead of recreating the table. +CREATE TABLE IF NOT EXISTS haystack_groonga_test ( + id TEXT PRIMARY KEY, + content TEXT, + meta JSONB, + score REAL +); + +CREATE INDEX IF NOT EXISTS pgroonga_haystack_groonga_test_index +ON haystack_groonga_test +USING pgroonga (content); + +GRANT ALL ON TABLE haystack_groonga_test TO postgres; diff --git a/integrations/supabase/nginx-groonga.conf b/integrations/supabase/nginx-groonga.conf new file mode 100644 index 0000000000..61ad7f3ee8 --- /dev/null +++ b/integrations/supabase/nginx-groonga.conf @@ -0,0 +1,18 @@ +# Minimal reverse proxy so supabase-py (which appends /rest/v1/) reaches PostgREST. +events {} + +http { + server { + listen 8000; + + location /rest/v1/ { + rewrite ^/rest/v1/(.*)$ /$1 break; + proxy_pass http://postgrest:3000; + proxy_set_header Host $host; + # Strip auth headers — PostgREST has no JWT secret configured, + # so all requests run as the anon role (postgres). + proxy_set_header Authorization ""; + proxy_set_header apikey ""; + } + } +} diff --git a/integrations/supabase/pyproject.toml b/integrations/supabase/pyproject.toml index ae2306c28e..aee0d50272 100644 --- a/integrations/supabase/pyproject.toml +++ b/integrations/supabase/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.26.1", "pgvector-haystack>=6.3.0", "supabase>=2.9.0"] +dependencies = ["haystack-ai>=2.26.1", "pgvector-haystack>=6.3.0", "supabase>=2.23.0"] [project.urls] Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase#readme" @@ -58,6 +58,7 @@ dependencies = [ "pytest-rerunfailures", "mypy", "pip", + "supabase", ] [tool.hatch.envs.test.scripts] @@ -153,6 +154,7 @@ show_missing = true exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] [tool.pytest.ini_options] +asyncio_mode = "auto" addopts = "--strict-markers" markers = [ "integration: integration tests", diff --git a/integrations/supabase/pytest b/integrations/supabase/pytest new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py index fdc5a89c23..fc330a7028 100644 --- a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/__init__.py @@ -3,6 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 from .embedding_retriever import SupabasePgvectorEmbeddingRetriever +from .groonga_bm25_retriever import SupabaseGroongaBM25Retriever from .keyword_retriever import SupabasePgvectorKeywordRetriever -__all__ = ["SupabasePgvectorEmbeddingRetriever", "SupabasePgvectorKeywordRetriever"] +__all__ = [ + "SupabaseGroongaBM25Retriever", + "SupabasePgvectorEmbeddingRetriever", + "SupabasePgvectorKeywordRetriever", +] diff --git a/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_bm25_retriever.py b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_bm25_retriever.py new file mode 100644 index 0000000000..8442457478 --- /dev/null +++ b/integrations/supabase/src/haystack_integrations/components/retrievers/supabase/groonga_bm25_retriever.py @@ -0,0 +1,153 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import copy +from typing import Any + +from haystack import component, default_from_dict, default_to_dict +from haystack.dataclasses import Document +from haystack.document_stores.types import FilterPolicy +from haystack.document_stores.types.filter_policy import apply_filter_policy + +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + + +@component +class SupabaseGroongaBM25Retriever: + """ + Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. + + This retriever works without embeddings — it searches documents using plain text queries. + It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. + + Note: async operations are not supported as the supabase-py sync client does not expose + awaitable query methods. Use the sync run() method instead. + + Example usage: + + ```python + from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + from haystack_integrations.components.retrievers.supabase import SupabaseGroongaBM25Retriever + from haystack.utils import Secret + + document_store = SupabaseGroongaDocumentStore( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", + ) + document_store.warm_up() + + retriever = SupabaseGroongaBM25Retriever(document_store=document_store, top_k=10) + result = retriever.run(query="python programming") + print(result["documents"]) + ``` + """ + + def __init__( + self, + *, + document_store: SupabaseGroongaDocumentStore, + filters: dict[str, Any] | None = None, + top_k: int = 10, + filter_policy: str | FilterPolicy = FilterPolicy.REPLACE, + ) -> None: + """ + Initialize the SupabaseGroongaBM25Retriever. + + :param document_store: An instance of SupabaseGroongaDocumentStore. + :param filters: Optional filters applied to retrieved Documents. + :param top_k: Maximum number of Documents to return. Defaults to 10. + :param filter_policy: Policy to determine how filters are applied. + :raises ValueError: If document_store is not an instance of SupabaseGroongaDocumentStore. + """ + if not isinstance(document_store, SupabaseGroongaDocumentStore): + msg = "document_store must be an instance of SupabaseGroongaDocumentStore" + raise ValueError(msg) + + self.document_store = document_store + self.filters = filters or {} + self.top_k = top_k + self.filter_policy = ( + filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy) + ) + + @component.output_types(documents=list[Document]) + def run( + self, + query: str, + filters: dict[str, Any] | None = None, + top_k: int | None = None, + ) -> dict[str, list[Document]]: + """ + Runs the retriever on the given query. + + :param query: The text query to search for. + :param filters: Optional runtime filters. Merged or replaced based on filter_policy. + :param top_k: Optional override for maximum number of documents to return. + :returns: Dictionary with key "documents" containing list of matching Documents. + """ + if not query: + return {"documents": []} + + merged_filters = apply_filter_policy(self.filter_policy, self.filters, filters) + effective_top_k = top_k if top_k is not None else self.top_k + + documents = self.document_store._groonga_retrieval( + query=query, + top_k=effective_top_k, + filters=merged_filters, + ) + + return {"documents": documents} + + @component.output_types(documents=list[Document]) + async def run_async( + self, + query: str, + filters: dict[str, Any] | None = None, + top_k: int | None = None, + ) -> dict[str, list[Document]]: + """ + Async version of run(). + + Note: supabase-py's sync client does not support native async queries. + This method runs the synchronous retrieval and returns the result. + For fully async support, consider using acreate_client() from supabase-py + and refactoring the document store accordingly. + + :param query: The text query to search for. + :param filters: Optional runtime filters. Merged or replaced based on filter_policy. + :param top_k: Optional override for maximum number of documents to return. + :returns: Dictionary with key "documents" containing list of matching Documents. + """ + return self.run(query=query, filters=filters, top_k=top_k) + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: Dictionary with serialized data. + """ + return default_to_dict( + self, + filters=self.filters, + top_k=self.top_k, + filter_policy=self.filter_policy.value, + document_store=self.document_store.to_dict(), + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "SupabaseGroongaBM25Retriever": + """ + Deserializes the component from a dictionary. + + :param data: Dictionary to deserialize from. + :returns: Deserialized component. + """ + data = copy.deepcopy(data) + doc_store_params = data["init_parameters"]["document_store"] + data["init_parameters"]["document_store"] = SupabaseGroongaDocumentStore.from_dict(doc_store_params) + if filter_policy := data["init_parameters"].get("filter_policy"): + data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(filter_policy) + return default_from_dict(cls, data) diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py index 7512a97b75..06d697a980 100644 --- a/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/__init__.py @@ -2,5 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 from .document_store import SupabasePgvectorDocumentStore +from .groonga_document_store import SupabaseGroongaDocumentStore -__all__ = ["SupabasePgvectorDocumentStore"] +__all__ = [ + "SupabaseGroongaDocumentStore", + "SupabasePgvectorDocumentStore", +] diff --git a/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py new file mode 100644 index 0000000000..7072879b40 --- /dev/null +++ b/integrations/supabase/src/haystack_integrations/document_stores/supabase/groonga_document_store.py @@ -0,0 +1,576 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import re +from datetime import datetime as _datetime +from typing import Any + +from haystack import default_from_dict, default_to_dict, logging +from haystack.dataclasses import Document +from haystack.document_stores.errors import DuplicateDocumentError +from haystack.document_stores.types import DocumentStore, DuplicatePolicy +from haystack.errors import FilterError +from haystack.utils.auth import Secret, deserialize_secrets_inplace +from postgrest import CountMethod + +from supabase import Client, create_client + +logger = logging.getLogger(__name__) + + +class SupabaseGroongaDocumentStore(DocumentStore): + """ + A Document Store for Supabase using PGroonga for full-text search. + + PGroonga is a PostgreSQL extension for fast, multilingual full-text search. + Unlike vector search, this store works with plain text queries — no embeddings needed. + + Prerequisites: + - A Supabase project with PGroonga extension enabled. + - Enable PGroonga in your Supabase project by running: + `CREATE EXTENSION IF NOT EXISTS pgroonga;` + + Example usage: + + ```python + from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + from haystack.utils import Secret + + document_store = SupabaseGroongaDocumentStore( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", + ) + document_store.warm_up() + ``` + """ + + def __init__( + self, + *, + supabase_url: str, + supabase_key: Secret = Secret.from_env_var("SUPABASE_SERVICE_KEY", strict=False), + table_name: str = "haystack_groonga_documents", + recreate_table: bool = False, + ) -> None: + """ + Creates a new SupabaseGroongaDocumentStore instance. + + Note: Call warm_up() before using the store to initialize the client and table. + + :param supabase_url: The URL of your Supabase project. + Format: `https://.supabase.co` + :param supabase_key: The service role key for your Supabase project. + Defaults to reading from the `SUPABASE_SERVICE_KEY` environment variable. + :param table_name: The name of the table to store documents in. + Defaults to `haystack_groonga_documents`. + :param recreate_table: Whether to drop and recreate the table on startup. + Defaults to `False`. + """ + if not re.fullmatch(r"[a-zA-Z_][a-zA-Z0-9_]*", table_name): + msg = f"Invalid table_name {table_name!r}: must match [a-zA-Z_][a-zA-Z0-9_]*" + raise ValueError(msg) + + self.supabase_url = supabase_url + self.supabase_key = supabase_key + self.table_name = table_name + self.recreate_table = recreate_table + + # Client is initialized lazily in warm_up() + self._client: Client | None = None + + def warm_up(self) -> None: + """ + Initializes the Supabase client and sets up the table. + + Must be called before using the document store. + """ + key = self.supabase_key.resolve_value() or "" + self._client = create_client(self.supabase_url, key) + self._setup_table() + + def _setup_table(self) -> None: + """ + Creates the documents table with PGroonga index if it does not exist. + + If recreate_table is True, drops and recreates the table. + """ + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) + + if self.recreate_table: + self._client.rpc("exec_sql", {"query": f"DROP TABLE IF EXISTS {self.table_name};"}).execute() + + # Create table if not exists + create_table_sql = f""" + CREATE TABLE IF NOT EXISTS {self.table_name} ( + id TEXT PRIMARY KEY, + content TEXT, + meta JSONB, + score REAL + ); + """ + self._client.rpc("exec_sql", {"query": create_table_sql}).execute() + + # Create PGroonga index on content column + create_index_sql = f""" + CREATE INDEX IF NOT EXISTS pgroonga_{self.table_name}_index + ON {self.table_name} + USING pgroonga (content); + """ + self._client.rpc("exec_sql", {"query": create_index_sql}).execute() + + def count_documents(self) -> int: + """ + Returns the number of documents in the store. + + :returns: Number of documents. + """ + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) + result = self._client.table(self.table_name).select("*", count=CountMethod.exact).execute() + return int(result.count) if result.count is not None else 0 + + def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]: + """ + Returns documents matching the given filters. + + Supports the standard Haystack filter syntax with the following operators: + + - Comparison: ``==``, ``!=``, ``>``, ``>=``, ``<``, ``<=``, ``in``, ``not in`` + - Logical: ``AND``, ``OR``, ``NOT`` (``OR`` and ``NOT`` support simple conditions + only — no nested logical operators inside them) + + **Known limitation:** For ``!=`` and ``not in`` on ``meta.*`` fields, documents + where the field is absent are included in the result (matching Python ``None != value`` + semantics). For ``>`` / ``>=`` / ``<`` / ``<=``, documents where the field is absent + are excluded (SQL ``NULL`` comparison semantics). + + :param filters: Optional Haystack filter dict. + Simple comparison: ``{"field": "meta.language", "operator": "==", "value": "en"}`` + Logical: ``{"operator": "AND", "conditions": [...]}`` + :returns: List of matching Document objects. + :raises FilterError: If the filter structure is malformed or uses an unsupported operator. + """ + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) + + query = self._client.table(self.table_name).select("*") + + if filters: + query = SupabaseGroongaDocumentStore._apply_filters(query, filters) + + result = query.execute() + return [self._to_haystack_document(row) for row in result.data if isinstance(row, dict)] + + @staticmethod + def _meta_col(field: str, value: Any) -> str: + """ + Choose the PostgREST column expression for a meta field. + + Uses the JSONB accessor (->) for numeric values so that PostgREST performs + correct numeric comparison. Uses the text accessor (->>) for strings, booleans, + None, and mixed lists, which return the JSON value as text. + """ + if not field.startswith("meta."): + return field + key = field[len("meta.") :] + if isinstance(value, list): + all_numeric = all(isinstance(v, (int, float)) and not isinstance(v, bool) for v in value if v is not None) + return f"meta->{key}" if (all_numeric and value) else f"meta->>{key}" + if isinstance(value, (int, float)) and not isinstance(value, bool): + return f"meta->{key}" + return f"meta->>{key}" + + @staticmethod + def _normalize_value(value: Any) -> Any: + """Convert Python booleans to lowercase strings compatible with JSONB text accessor.""" + if isinstance(value, bool): + return "true" if value else "false" + return value + + @staticmethod + def _apply_filters(query: Any, filters: dict[str, Any]) -> Any: + """ + Applies Haystack filters to a PostgREST query builder. + + Supports AND, OR, NOT logical operators and all standard comparison operators. + OR and NOT are supported for simple (non-nested) conditions only. + + :param query: The Supabase query builder. + :param filters: Haystack filter dict. + :returns: The query with filters applied. + :raises FilterError: For unsupported operators, invalid value types, or malformed filters. + """ + if not filters: + return query + + if "field" in filters: + return SupabaseGroongaDocumentStore._apply_condition(query, filters) + + if "operator" not in filters: + msg = "Logical filter must include an 'operator' key ('AND', 'OR', 'NOT')." + raise FilterError(msg) + + if "conditions" not in filters: + msg = "Logical filter must include a 'conditions' key." + raise FilterError(msg) + + op = filters["operator"] + conditions = filters["conditions"] + + if op == "AND": + for cond in conditions: + query = SupabaseGroongaDocumentStore._apply_filters(query, cond) + return query + + if op == "OR": + pg_op_map = {"==": "eq", "!=": "neq", ">": "gt", ">=": "gte", "<": "lt", "<=": "lte"} + parts = [] + for cond in conditions: + if "field" not in cond: + msg = "Nested logical operators inside OR are not supported." + raise FilterError(msg) + cond_field = cond.get("field", "") + cond_op = cond.get("operator", "") + cond_value = cond.get("value") + if cond_op not in pg_op_map: + msg = f"Operator '{cond_op}' inside OR filter is not supported." + raise FilterError(msg) + # Use text accessor (->>): PostgREST OR strings don't support JSONB (->) expressions. + col = f"meta->>{cond_field[len('meta.') :]}" if cond_field.startswith("meta.") else cond_field + norm = SupabaseGroongaDocumentStore._normalize_value(cond_value) + parts.append(f"{col}.{pg_op_map[cond_op]}.{norm}") + return query.or_(",".join(parts)) + + if op == "NOT": + # NOT(A AND B) = NOT_A OR NOT_B, with null-inclusive semantics. + # Use text accessor: PostgREST OR strings don't support JSONB (->) expressions. + neg_map = {"==": "neq", "!=": "eq", ">": "lte", ">=": "lt", "<": "gte", "<=": "gt"} + parts = [] + for cond in conditions: + if "field" not in cond: + msg = "Nested logical operators inside NOT are not supported." + raise FilterError(msg) + cond_field = cond.get("field", "") + cond_op = cond.get("operator", "") + cond_value = cond.get("value") + if cond_op not in neg_map: + msg = f"Operator '{cond_op}' inside NOT filter is not supported." + raise FilterError(msg) + col = f"meta->>{cond_field[len('meta.') :]}" if cond_field.startswith("meta.") else cond_field + norm = SupabaseGroongaDocumentStore._normalize_value(cond_value) + parts.append(f"{col}.{neg_map[cond_op]}.{norm}") + if cond_op == "==" and cond_field.startswith("meta."): + # NOT(field==value) also covers docs where the field is absent (SQL NULL semantics) + parts.append(f"{col}.is.null") + return query.or_(",".join(parts)) + + msg = f"Filter operator '{op}' is not supported. Supported logical operators: AND, OR, NOT." + raise FilterError(msg) + + @staticmethod + def _apply_condition(query: Any, condition: dict[str, Any]) -> Any: + field: str = condition.get("field", "") + + if "operator" not in condition: + msg = "Comparison filter must include an 'operator' key." + raise FilterError(msg) + + if "value" not in condition: + msg = "Comparison filter must include a 'value' key." + raise FilterError(msg) + + op: str = condition["operator"] + value = condition["value"] + + col = SupabaseGroongaDocumentStore._meta_col(field, value) + norm = SupabaseGroongaDocumentStore._normalize_value(value) + + if op == "==": + return query.is_(col, "null") if norm is None else query.eq(col, norm) + + if op == "!=": + if norm is None: + return query.not_.is_(col, "null") + if field.startswith("meta."): + # SQL: NULL != value returns NULL (not TRUE), so include docs where the field is absent. + key = field[len("meta.") :] + return query.or_(f"{col}.neq.{norm},meta->>{key}.is.null") + return query.neq(col, norm) + + if op in (">", ">=", "<", "<="): + if isinstance(value, list): + msg = f"Filter operator '{op}' does not support list values." + raise FilterError(msg) + if value is None: + return query.eq("id", "") + if isinstance(value, str): + try: + _datetime.fromisoformat(value) + except ValueError as err: + msg = f"Filter operator '{op}' does not support string values. Use a numeric or ISO date value." + raise FilterError(msg) from err + if op == ">": + return query.gt(col, norm) + if op == ">=": + return query.gte(col, norm) + if op == "<": + return query.lt(col, norm) + return query.lte(col, norm) + + if op == "in": + if not isinstance(value, list): + msg = "Filter operator 'in' requires a list value." + raise FilterError(msg) + return query.in_(col, value) + + if op == "not in": + if not isinstance(value, list): + msg = "Filter operator 'not in' requires a list value." + raise FilterError(msg) + if field.startswith("meta."): + # SQL: NULL NOT IN (...) returns NULL, so include docs where the field is absent. + key = field[len("meta.") :] + non_none = [v for v in value if v is not None] + vals = ",".join(str(v) for v in non_none) + return query.or_(f"{col}.not.in.({vals}),meta->>{key}.is.null") + return query.not_.in_(col, value) + + return query + + def write_documents( + self, + documents: list[Document], + policy: DuplicatePolicy = DuplicatePolicy.FAIL, + ) -> int: + """ + Writes documents to the store. + + :param documents: List of Haystack Document objects to write. + :param policy: How to handle duplicate documents. Defaults to DuplicatePolicy.FAIL. + :returns: Number of documents written. + """ + if not isinstance(documents, list): + msg = f"write_documents() expects a list of Document objects, got {type(documents).__name__}" + raise ValueError(msg) + for doc in documents: + if not isinstance(doc, Document): + msg = f"write_documents() expects Document objects, got {type(doc).__name__}" + raise ValueError(msg) + + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) + + if not documents: + return 0 + + written = 0 + for doc in documents: + row = { + "id": doc.id, + "content": doc.content or "", + "meta": doc.meta or {}, + "score": None, + } + if policy == DuplicatePolicy.OVERWRITE: + self._client.table(self.table_name).upsert(row).execute() + written += 1 + elif policy == DuplicatePolicy.SKIP: + existing = self._client.table(self.table_name).select("id").eq("id", doc.id).execute() + if not existing.data: + self._client.table(self.table_name).insert(row).execute() + written += 1 + elif policy == DuplicatePolicy.FAIL: + existing = self._client.table(self.table_name).select("id").eq("id", doc.id).execute() + if existing.data: + msg = f"Document with id {doc.id!r} already exists." + raise DuplicateDocumentError(msg) + self._client.table(self.table_name).insert(row).execute() + written += 1 + else: + self._client.table(self.table_name).insert(row).execute() + written += 1 + + return written + + def delete_by_filter(self, filters: dict[str, Any]) -> int: + """ + Deletes documents matching the given filters. + + :param filters: Filters to select documents for deletion. + :returns: Number of documents deleted. + """ + docs = self.filter_documents(filters=filters) + if not docs: + return 0 + self.delete_documents([doc.id for doc in docs]) + return len(docs) + + def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any]) -> int: + """ + Updates the metadata of documents matching the given filters. + + Provided meta fields are merged into the existing document metadata. + + :param filters: Filters to select documents to update. + :param meta: Metadata fields to set on matching documents. + :returns: Number of documents updated. + """ + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) + + docs = self.filter_documents(filters=filters) + if not docs: + return 0 + + for doc in docs: + row = { + "id": doc.id, + "content": doc.content or "", + "meta": {**doc.meta, **meta}, + "score": None, + } + self._client.table(self.table_name).upsert(row).execute() + + return len(docs) + + def delete_all_documents(self) -> None: + """ + Deletes all documents from the store. + """ + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) + self._client.table(self.table_name).delete().neq("id", "").execute() + + def delete_documents(self, document_ids: list[str]) -> None: + """ + Deletes documents with the given IDs. + + :param document_ids: List of document IDs to delete. + """ + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) + + if not document_ids: + return + self._client.table(self.table_name).delete().in_("id", document_ids).execute() + + def _groonga_retrieval( + self, + query: str, + top_k: int = 10, + filters: dict[str, Any] | None = None, + ) -> list[Document]: + """ + Searches documents using PGroonga full-text search. + + :param query: The text query to search for. + :param top_k: Maximum number of results to return. + :param filters: Optional filters to apply after retrieval. + :returns: List of matching Document objects ranked by relevance. + """ + if self._client is None: + msg = "Call warm_up() before using the document store." + raise RuntimeError(msg) + + result = self._client.rpc( + "groonga_search", + {"query_text": query, "table_name": self.table_name, "top_k": top_k}, + ).execute() + + data = result.data if isinstance(result.data, list) else [] + documents = [self._to_haystack_document(row) for row in data if isinstance(row, dict)] + + # Apply filters post-retrieval if provided + if filters: + documents = SupabaseGroongaDocumentStore._filter_documents_in_memory(documents, filters) + + return documents + + @staticmethod + def _filter_documents_in_memory(documents: list[Document], filters: dict[str, Any]) -> list[Document]: + """ + Filters a list of documents in memory based on the given filters. + + :param documents: List of documents to filter. + :param filters: Dictionary of filters to apply. + :returns: Filtered list of documents. + """ + conditions = filters.get("conditions", []) + filtered = [] + + for doc in documents: + match = True + for condition in conditions: + field = condition.get("field", "") + op = condition.get("operator", "==") + value = condition.get("value") + + if field.startswith("meta."): + meta_key = field[len("meta.") :] + doc_value = doc.meta.get(meta_key) + else: + doc_value = getattr(doc, field, None) + + if op == "==" and doc_value != value: + match = False + break + elif op == "!=" and doc_value == value: + match = False + break + elif op == "in" and doc_value not in value: + match = False + break + + if match: + filtered.append(doc) + + return filtered + + def _to_haystack_document(self, row: dict[str, Any]) -> Document: + """ + Converts a database row dictionary into a Haystack Document. + + :param row: Dictionary from database result. + :returns: Haystack Document object. + """ + return Document( + id=row["id"], + content=row.get("content"), + meta=row.get("meta") or {}, + score=row.get("score"), + ) + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: Dictionary with serialized data. + """ + return default_to_dict( + self, + supabase_url=self.supabase_url, + supabase_key=self.supabase_key.to_dict(), + table_name=self.table_name, + recreate_table=self.recreate_table, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "SupabaseGroongaDocumentStore": + """ + Deserializes the component from a dictionary. + + :param data: Dictionary to deserialize from. + :returns: Deserialized component. + """ + deserialize_secrets_inplace(data["init_parameters"], ["supabase_key"]) + return default_from_dict(cls, data) diff --git a/integrations/supabase/tests/test_groonga_document_store.py b/integrations/supabase/tests/test_groonga_document_store.py new file mode 100644 index 0000000000..c6b239a9ce --- /dev/null +++ b/integrations/supabase/tests/test_groonga_document_store.py @@ -0,0 +1,206 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import MagicMock, patch + +import pytest +from haystack.dataclasses import Document +from haystack.document_stores.errors import DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy + +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + + +@pytest.fixture +def mock_supabase_client(): + """Creates a mock Supabase client so we never hit a real database.""" + with patch("haystack_integrations.document_stores.supabase.groonga_document_store.create_client") as mock_create: + mock_client = MagicMock() + mock_create.return_value = mock_client + + mock_client.rpc.return_value.execute.return_value = MagicMock(data=[], count=0) + + mock_table = MagicMock() + mock_client.table.return_value = mock_table + mock_table.select.return_value = mock_table + mock_table.insert.return_value = mock_table + mock_table.upsert.return_value = mock_table + mock_table.delete.return_value = mock_table + mock_table.eq.return_value = mock_table + mock_table.in_.return_value = mock_table + mock_table.execute.return_value = MagicMock(data=[], count=0) + + yield mock_client + + +@pytest.fixture +def groonga_store(mock_supabase_client, monkeypatch): # noqa: ARG001 + """Creates a SupabaseGroongaDocumentStore with mocked client and calls warm_up().""" + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + table_name="test_groonga_documents", + recreate_table=False, + ) + store.warm_up() + return store + + +class TestDocumentStore: + def test_init_defaults(self, monkeypatch): + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore(supabase_url="https://fake-project.supabase.co") + assert store.table_name == "haystack_groonga_documents" + assert store.recreate_table is False + assert store.supabase_url == "https://fake-project.supabase.co" + assert store._client is None + + def test_init_custom_params(self, monkeypatch): + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + table_name="my_custom_table", + recreate_table=True, + ) + assert store.table_name == "my_custom_table" + assert store.recreate_table is True + assert store._client is None + + def test_invalid_table_name_raises(self, monkeypatch): + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + with pytest.raises(ValueError, match="Invalid table_name"): + SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + table_name="bad-name; DROP TABLE users;", + ) + + def test_table_name_with_numbers_allowed(self, monkeypatch): + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + table_name="my_table_123", + ) + assert store.table_name == "my_table_123" + + def test_warm_up_initializes_client(self, mock_supabase_client, monkeypatch): # noqa: ARG002 + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore(supabase_url="https://fake-project.supabase.co") + assert store._client is None + store.warm_up() + assert store._client is not None + + def test_count_documents(self, groonga_store, mock_supabase_client): + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock(count=5) + assert groonga_store.count_documents() == 5 + + def test_count_documents_empty(self, groonga_store, mock_supabase_client): + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock(count=0) + assert groonga_store.count_documents() == 0 + + def test_write_documents(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[]) + mock_table.insert.return_value.execute.return_value = MagicMock(data=[{}]) + + documents = [ + Document(content="Python is great"), + Document(content="Haystack is a RAG framework"), + ] + written = groonga_store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE) + assert written == 2 + + def test_write_documents_empty(self, groonga_store): + assert groonga_store.write_documents([]) == 0 + + def test_write_documents_overwrite(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.upsert.return_value.execute.return_value = MagicMock(data=[{}]) + + written = groonga_store.write_documents([Document(content="test document")], policy=DuplicatePolicy.OVERWRITE) + assert written == 1 + mock_table.upsert.assert_called_once() + + def test_write_documents_skip_existing(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[{"id": "existing"}]) + + written = groonga_store.write_documents([Document(content="already exists")], policy=DuplicatePolicy.SKIP) + assert written == 0 + + def test_write_documents_fail_on_duplicate(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[{"id": "existing"}]) + + with pytest.raises(DuplicateDocumentError): + groonga_store.write_documents([Document(content="duplicate doc")], policy=DuplicatePolicy.FAIL) + + def test_delete_all_documents(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.delete.return_value.neq.return_value.execute.return_value = MagicMock(data=[]) + + groonga_store.delete_all_documents() + mock_table.delete.assert_called_once() + + def test_delete_documents(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.delete.return_value.in_.return_value.execute.return_value = MagicMock(data=[]) + + groonga_store.delete_documents(["id1", "id2"]) + mock_table.delete.assert_called_once() + + def test_delete_documents_empty(self, groonga_store, mock_supabase_client): + groonga_store.delete_documents([]) + mock_supabase_client.table.return_value.delete.assert_not_called() + + def test_filter_documents(self, groonga_store, mock_supabase_client): + mock_supabase_client.table.return_value.select.return_value.execute.return_value = MagicMock( + data=[ + {"id": "1", "content": "Python is great", "meta": {}, "score": None}, + {"id": "2", "content": "Haystack rocks", "meta": {}, "score": None}, + ] + ) + docs = groonga_store.filter_documents() + assert len(docs) == 2 + assert docs[0].content == "Python is great" + assert docs[1].content == "Haystack rocks" + + def test_filter_documents_with_filters(self, groonga_store, mock_supabase_client): + mock_table = mock_supabase_client.table.return_value + mock_table.select.return_value.eq.return_value.execute.return_value = MagicMock( + data=[{"id": "1", "content": "Python is great", "meta": {"language": "en"}, "score": None}] + ) + filters = {"operator": "AND", "conditions": [{"field": "meta.language", "operator": "==", "value": "en"}]} + docs = groonga_store.filter_documents(filters=filters) + assert len(docs) == 1 + + def test_to_dict(self, groonga_store): + result = groonga_store.to_dict() + assert result["type"] == ( + "haystack_integrations.document_stores.supabase.groonga_document_store.SupabaseGroongaDocumentStore" + ) + assert result["init_parameters"]["table_name"] == "test_groonga_documents" + assert result["init_parameters"]["supabase_url"] == "https://fake-project.supabase.co" + assert result["init_parameters"]["recreate_table"] is False + + def test_from_dict(self, mock_supabase_client, monkeypatch): # noqa: ARG002 + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + data = { + "type": ( + "haystack_integrations.document_stores.supabase.groonga_document_store.SupabaseGroongaDocumentStore" + ), + "init_parameters": { + "supabase_url": "https://fake-project.supabase.co", + "supabase_key": { + "type": "env_var", + "env_vars": ["SUPABASE_SERVICE_KEY"], + "strict": True, + }, + "table_name": "test_groonga_documents", + "recreate_table": False, + }, + } + store = SupabaseGroongaDocumentStore.from_dict(data) + assert store.table_name == "test_groonga_documents" + assert store.supabase_url == "https://fake-project.supabase.co" + assert store._client is None diff --git a/integrations/supabase/tests/test_groonga_integration.py b/integrations/supabase/tests/test_groonga_integration.py new file mode 100644 index 0000000000..9d0fdf5d3e --- /dev/null +++ b/integrations/supabase/tests/test_groonga_integration.py @@ -0,0 +1,142 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +# Integration tests for SupabaseGroongaDocumentStore and SupabaseGroongaBM25Retriever. +# +# These tests require a running stack of three Docker containers defined in +# docker-compose-groonga.yml: +# +# pgroonga-postgres PostgreSQL 17 + PGroonga extension (port 5433) +# postgrest PostgREST REST API on top of PostgreSQL (port 3000) +# nginx Reverse proxy that strips the /rest/v1/ prefix that +# supabase-py always appends, then forwards to PostgREST +# (port 8000 — the URL used by supabase-py) +# +# Start the stack locally with: +# make docker-groonga +# +# The test fixture falls back to http://localhost:8000 when SUPABASE_URL is not +# set, so no environment variables are required for local development. +# +# All tests share a single pre-created table (haystack_groonga_test) defined in +# init-pgroonga.sql. PostgREST caches its schema at startup and does not reload +# it for tables created later, so the table must exist before PostgREST starts. +# Data is cleared in fixture teardown instead of recreating the table. + +import os + +import pytest +from haystack.dataclasses import Document +from haystack.document_stores.types import DuplicatePolicy +from haystack.testing.document_store import ( + DeleteAllTest, + DeleteByFilterTest, + DocumentStoreBaseTests, + FilterableDocsFixtureMixin, + UpdateByFilterTest, +) +from haystack.utils import Secret + +from haystack_integrations.components.retrievers.supabase import SupabaseGroongaBM25Retriever +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + +# Defaults for the local Docker stack (docker-compose-groonga.yml). +# PostgREST is configured without a JWT secret, so the key is not validated. +_LOCAL_SUPABASE_URL = "http://localhost:8000" +_LOCAL_SERVICE_KEY = "local-dev-key-not-validated" + + +def _make_store(request: pytest.FixtureRequest) -> SupabaseGroongaDocumentStore: # noqa: ARG001 + supabase_url = os.environ.get("SUPABASE_URL", _LOCAL_SUPABASE_URL) + service_key = os.environ.get("SUPABASE_SERVICE_KEY", _LOCAL_SERVICE_KEY) + store = SupabaseGroongaDocumentStore( + supabase_url=supabase_url, + supabase_key=Secret.from_token(service_key), + # Fixed table pre-created in init-pgroonga.sql so PostgREST knows about it at startup. + # Tests clear data in teardown instead of recreating the table. + table_name="haystack_groonga_test", + recreate_table=False, + ) + store.warm_up() + return store + + +@pytest.mark.integration +class TestSupabaseGroongaDocumentStoreIntegration( + DocumentStoreBaseTests, + DeleteAllTest, + DeleteByFilterTest, + FilterableDocsFixtureMixin, + UpdateByFilterTest, +): + @pytest.fixture + def document_store(self, request): + store = _make_store(request) + yield store + store.delete_all_documents() + + @staticmethod + def assert_documents_are_equal(received: list[Document], expected: list[Document]) -> None: + # Embeddings are not stored; strip them and sort by id for order-independent comparison. + def normalize(doc: Document) -> Document: + return Document(id=doc.id, content=doc.content, meta=doc.meta) + + assert sorted([normalize(d) for d in received], key=lambda d: d.id or "") == sorted( + [normalize(d) for d in expected], key=lambda d: d.id or "" + ) + + def test_write_documents(self, document_store: SupabaseGroongaDocumentStore) -> None: + docs = [ + Document(content="First document", meta={"key": "val"}), + Document(content="Second document"), + ] + assert document_store.write_documents(docs, DuplicatePolicy.FAIL) == len(docs) + result = document_store.filter_documents() + self.assert_documents_are_equal(result, docs) + + +@pytest.mark.integration +class TestGroongaRetriever: + @pytest.fixture + def document_store(self, request): + store = _make_store(request) + yield store + store.delete_all_documents() + + def test_groonga_retrieval(self, document_store): + docs = [ + Document(content="Python is a great programming language"), + Document(content="Haystack is built for RAG pipelines"), + Document(content="Supabase is a great backend platform"), + ] + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + results = document_store._groonga_retrieval(query="Python", top_k=5) + assert len(results) >= 1 + assert any("Python" in doc.content for doc in results) + + def test_groonga_retrieval_top_k(self, document_store): + docs = [Document(content=f"document about python number {i}") for i in range(5)] + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + results = document_store._groonga_retrieval(query="python", top_k=2) + assert len(results) <= 2 + + def test_retriever_run(self, document_store): + docs = [ + Document(content="Python programming is fun"), + Document(content="Java is also popular"), + ] + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + retriever = SupabaseGroongaBM25Retriever(document_store=document_store, top_k=5) + result = retriever.run(query="Python") + + assert "documents" in result + assert len(result["documents"]) >= 1 + assert any("Python" in doc.content for doc in result["documents"]) + + def test_retriever_empty_query(self, document_store): + retriever = SupabaseGroongaBM25Retriever(document_store=document_store) + assert retriever.run(query="") == {"documents": []} diff --git a/integrations/supabase/tests/test_groonga_retriever.py b/integrations/supabase/tests/test_groonga_retriever.py new file mode 100644 index 0000000000..21102008b2 --- /dev/null +++ b/integrations/supabase/tests/test_groonga_retriever.py @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import MagicMock, patch + +import pytest + +from haystack_integrations.components.retrievers.supabase import SupabaseGroongaBM25Retriever +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore + + +@pytest.fixture +def mock_supabase_client(): + """Creates a mock Supabase client so we never hit a real database.""" + with patch("haystack_integrations.document_stores.supabase.groonga_document_store.create_client") as mock_create: + mock_client = MagicMock() + mock_create.return_value = mock_client + + mock_client.rpc.return_value.execute.return_value = MagicMock(data=[], count=0) + + mock_table = MagicMock() + mock_client.table.return_value = mock_table + mock_table.select.return_value = mock_table + mock_table.insert.return_value = mock_table + mock_table.upsert.return_value = mock_table + mock_table.delete.return_value = mock_table + mock_table.eq.return_value = mock_table + mock_table.in_.return_value = mock_table + mock_table.execute.return_value = MagicMock(data=[], count=0) + + yield mock_client + + +@pytest.fixture +def groonga_store(mock_supabase_client, monkeypatch): # noqa: ARG001 + """Creates a SupabaseGroongaDocumentStore with mocked client and calls warm_up().""" + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + store = SupabaseGroongaDocumentStore( + supabase_url="https://fake-project.supabase.co", + table_name="test_groonga_documents", + recreate_table=False, + ) + store.warm_up() + return store + + +class TestRetriever: + def test_init_invalid_store(self): + with pytest.raises(ValueError, match="document_store must be an instance"): + SupabaseGroongaBM25Retriever(document_store="not_a_store") + + def test_init(self, groonga_store): + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store, top_k=5) + assert retriever.top_k == 5 + assert retriever.document_store is groonga_store + + def test_init_default_top_k(self, groonga_store): + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store) + assert retriever.top_k == 10 + + def test_run_empty_query(self, groonga_store): + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store) + assert retriever.run(query="") == {"documents": []} + + def test_run(self, groonga_store, mock_supabase_client): + mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( + data=[{"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}] + ) + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store, top_k=5) + result = retriever.run(query="Python") + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "Python is great" + + @pytest.mark.asyncio + async def test_run_async(self, groonga_store, mock_supabase_client): + mock_supabase_client.rpc.return_value.execute.return_value = MagicMock( + data=[{"id": "1", "content": "Python is great", "meta": {}, "score": 1.0}] + ) + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store, top_k=5) + result = await retriever.run_async(query="Python") + assert len(result["documents"]) == 1 + + @pytest.mark.asyncio + async def test_run_async_empty_query(self, groonga_store): + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store) + assert await retriever.run_async(query="") == {"documents": []} + + def test_to_dict(self, groonga_store): + retriever = SupabaseGroongaBM25Retriever(document_store=groonga_store, top_k=5) + result = retriever.to_dict() + assert result["init_parameters"]["top_k"] == 5 + assert "document_store" in result["init_parameters"] + + def test_from_dict(self, mock_supabase_client, monkeypatch): # noqa: ARG002 + monkeypatch.setenv("SUPABASE_SERVICE_KEY", "fake-test-key") + data = { + "type": ( + "haystack_integrations.components.retrievers.supabase.groonga_bm25_retriever.SupabaseGroongaBM25Retriever" + ), + "init_parameters": { + "top_k": 7, + "filters": {}, + "filter_policy": "replace", + "document_store": { + "type": ( + "haystack_integrations.document_stores.supabase" + ".groonga_document_store.SupabaseGroongaDocumentStore" + ), + "init_parameters": { + "supabase_url": "https://fake-project.supabase.co", + "supabase_key": { + "type": "env_var", + "env_vars": ["SUPABASE_SERVICE_KEY"], + "strict": True, + }, + "table_name": "test_groonga_documents", + "recreate_table": False, + }, + }, + }, + } + retriever = SupabaseGroongaBM25Retriever.from_dict(data) + assert retriever.top_k == 7