From 172f8978697f344194bbc218263cb81e30e1a2e2 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 5 Jan 2026 15:17:22 +0100 Subject: [PATCH 01/58] fixed metadata merging to properly update the meta key --- .../opensearch/document_store.py | 190 +++++++++++------- .../opensearch/tests/test_document_store.py | 26 +++ .../tests/test_document_store_async.py | 32 +++ 3 files changed, 176 insertions(+), 72 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 541466326d..b9c45e690e 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -64,25 +64,25 @@ class OpenSearchDocumentStore: """ def __init__( - self, - *, - hosts: Optional[Hosts] = None, - index: str = "default", - max_chunk_bytes: int = DEFAULT_MAX_CHUNK_BYTES, - embedding_dim: int = 768, - return_embedding: bool = False, - method: Optional[dict[str, Any]] = None, - mappings: Optional[dict[str, Any]] = None, - settings: Optional[dict[str, Any]] = DEFAULT_SETTINGS, - create_index: bool = True, - http_auth: Any = ( - Secret.from_env_var("OPENSEARCH_USERNAME", strict=False), # noqa: B008 - Secret.from_env_var("OPENSEARCH_PASSWORD", strict=False), # noqa: B008 - ), - use_ssl: Optional[bool] = None, - verify_certs: Optional[bool] = None, - timeout: Optional[int] = None, - **kwargs: Any, + self, + *, + hosts: Optional[Hosts] = None, + index: str = "default", + max_chunk_bytes: int = DEFAULT_MAX_CHUNK_BYTES, + embedding_dim: int = 768, + return_embedding: bool = False, + method: Optional[dict[str, Any]] = None, + mappings: Optional[dict[str, Any]] = None, + settings: Optional[dict[str, Any]] = DEFAULT_SETTINGS, + create_index: bool = True, + http_auth: Any = ( + Secret.from_env_var("OPENSEARCH_USERNAME", strict=False), # noqa: B008 + Secret.from_env_var("OPENSEARCH_PASSWORD", strict=False), # noqa: B008 + ), + use_ssl: Optional[bool] = None, + verify_certs: Optional[bool] = None, + timeout: Optional[int] = None, + **kwargs: Any, ) -> None: """ Creates a new OpenSearchDocumentStore instance. @@ -174,10 +174,10 @@ def _get_default_mappings(self) -> dict[str, Any]: return default_mappings def create_index( - self, - index: Optional[str] = None, - mappings: Optional[dict[str, Any]] = None, - settings: Optional[dict[str, Any]] = None, + self, + index: Optional[str] = None, + mappings: Optional[dict[str, Any]] = None, + settings: Optional[dict[str, Any]] = None, ) -> None: """ Creates an index in OpenSearch. @@ -399,7 +399,7 @@ async def filter_documents_async(self, filters: Optional[dict[str, Any]] = None) return await self._search_documents_async(self._prepare_filter_search_request(filters)) def _prepare_bulk_write_request( - self, *, documents: list[Document], policy: DuplicatePolicy, is_async: bool + self, *, documents: list[Document], policy: DuplicatePolicy, is_async: bool ) -> dict[str, Any]: if len(documents) > 0 and not isinstance(documents[0], Document): msg = "param 'documents' must contain a list of objects of type Document" @@ -487,7 +487,7 @@ def write_documents(self, documents: list[Document], policy: DuplicatePolicy = D return documents_written async def write_documents_async( - self, documents: list[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE + self, documents: list[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE ) -> int: """ Asynchronously writes documents to the document store. @@ -757,14 +757,14 @@ async def update_by_filter_async(self, filters: dict[str, Any], meta: dict[str, raise DocumentStoreError(msg) from e def _prepare_bm25_search_request( - self, - *, - query: str, - filters: Optional[dict[str, Any]], - fuzziness: Union[int, str], - top_k: int, - all_terms_must_match: bool, - custom_query: Optional[dict[str, Any]], + self, + *, + query: str, + filters: Optional[dict[str, Any]], + fuzziness: Union[int, str], + top_k: int, + all_terms_must_match: bool, + custom_query: Optional[dict[str, Any]], ) -> dict[str, Any]: if not query: body: dict[str, Any] = {"query": {"bool": {"must": {"match_all": {}}}}} @@ -822,15 +822,15 @@ def _postprocess_bm25_search_results(*, results: list[Document], scale_score: bo doc.score = float(1 / (1 + exp(-(doc.score / float(BM25_SCALING_FACTOR))))) def _bm25_retrieval( - self, - query: str, - *, - filters: Optional[dict[str, Any]] = None, - fuzziness: Union[int, str] = "AUTO", - top_k: int = 10, - scale_score: bool = False, - all_terms_must_match: bool = False, - custom_query: Optional[dict[str, Any]] = None, + self, + query: str, + *, + filters: Optional[dict[str, Any]] = None, + fuzziness: Union[int, str] = "AUTO", + top_k: int = 10, + scale_score: bool = False, + all_terms_must_match: bool = False, + custom_query: Optional[dict[str, Any]] = None, ) -> list[Document]: """ Retrieves documents that match the provided `query` using the BM25 search algorithm. @@ -860,15 +860,15 @@ def _bm25_retrieval( return documents async def _bm25_retrieval_async( - self, - query: str, - *, - filters: Optional[dict[str, Any]] = None, - fuzziness: str = "AUTO", - top_k: int = 10, - scale_score: bool = False, - all_terms_must_match: bool = False, - custom_query: Optional[dict[str, Any]] = None, + self, + query: str, + *, + filters: Optional[dict[str, Any]] = None, + fuzziness: str = "AUTO", + top_k: int = 10, + scale_score: bool = False, + all_terms_must_match: bool = False, + custom_query: Optional[dict[str, Any]] = None, ) -> list[Document]: """ Asynchronously retrieves documents that match the provided `query` using the BM25 search algorithm. @@ -900,13 +900,13 @@ async def _bm25_retrieval_async( return documents def _prepare_embedding_search_request( - self, - *, - query_embedding: list[float], - filters: Optional[dict[str, Any]], - top_k: int, - custom_query: Optional[dict[str, Any]], - efficient_filtering: bool = False, + self, + *, + query_embedding: list[float], + filters: Optional[dict[str, Any]], + top_k: int, + custom_query: Optional[dict[str, Any]], + efficient_filtering: bool = False, ) -> dict[str, Any]: if not query_embedding: msg = "query_embedding must be a non-empty list of floats" @@ -956,13 +956,13 @@ def _prepare_embedding_search_request( return body def _embedding_retrieval( - self, - query_embedding: list[float], - *, - filters: Optional[dict[str, Any]] = None, - top_k: int = 10, - custom_query: Optional[dict[str, Any]] = None, - efficient_filtering: bool = False, + self, + query_embedding: list[float], + *, + filters: Optional[dict[str, Any]] = None, + top_k: int = 10, + custom_query: Optional[dict[str, Any]] = None, + efficient_filtering: bool = False, ) -> list[Document]: """ Retrieves documents that are most similar to the query embedding using a vector similarity metric. @@ -986,13 +986,13 @@ def _embedding_retrieval( return self._search_documents(search_params) async def _embedding_retrieval_async( - self, - query_embedding: list[float], - *, - filters: Optional[dict[str, Any]] = None, - top_k: int = 10, - custom_query: Optional[dict[str, Any]] = None, - efficient_filtering: bool = False, + self, + query_embedding: list[float], + *, + filters: Optional[dict[str, Any]] = None, + top_k: int = 10, + custom_query: Optional[dict[str, Any]] = None, + efficient_filtering: bool = False, ) -> list[Document]: """ Asynchronously retrieves documents that are most similar to the query embedding using a vector similarity @@ -1032,3 +1032,49 @@ def _render_custom_query(self, custom_query: Any, substitutions: dict[str, Any]) return substitutions.get(custom_query, custom_query) return custom_query + + def count_documents_by_filter(self, filters: dict) -> int: + """ + Returns the number of documents that match the provided filters. + + :param filters: The filters to apply to count documents. + For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) + :returns: The number of documents that match the filters. + """ + self._ensure_initialized() + assert self._client is not None + + normalized_filters = normalize_filters(filters) + body = {"query": {"bool": {"filter": normalized_filters}}} + return self._client.count(index=self._index, body=body)["count"] + + async def count_documents_by_filter_async(self, filters: dict) -> int: + """ + Asynchronously returns the number of documents that match the provided filters. + + :param filters: The filters to apply to count documents. + For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) + :returns: The number of documents that match the filters. + """ + await self._ensure_initialized_async() + assert self._async_client is not None + + normalized_filters = normalize_filters(filters) + body = {"query": {"bool": {"filter": normalized_filters}}} + return (await self._async_client.count(index=self._index, body=body))["count"] + + def count_distinct_values_by_filter(self, filters: dict) -> dict[str, int]: + pass + + def get_fields_info(self) -> dict[str, dict]: + pass + + def get_field_min_max(self, metadata_field: str) -> dict[str, Any]: + pass + + def get_field_unique_values( + self, metadata_field: str, search_term: str | None, from_: int, size: int)-> tuple[list[str], int]: + pass + + def query_sql(self, query: str): + pass diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index d74cdbaa80..c35a9449b2 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -576,3 +576,29 @@ def test_update_by_filter(self, document_store: OpenSearchDocumentStore): ) assert len(draft_docs) == 1 assert draft_docs[0].meta["category"] == "B" + + def test_count_documents_by_filter(self, document_store: OpenSearchDocumentStore): + docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active"}), + Document(content="Doc 2", meta={"category": "B", "status": "active"}), + Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), + Document(content="Doc 4", meta={"category": "A", "status": "active"}), + ] + document_store.write_documents(docs) + assert document_store.count_documents() == 4 + + count_a = document_store.count_documents_by_filter( + filters={"field": "meta.category", "operator": "==", "value": "A"} + ) + assert count_a == 3 + + count_a_active = document_store.count_documents_by_filter( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "A"}, + {"field": "meta.status", "operator": "==", "value": "active"}, + ], + } + ) + assert count_a_active == 2 diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index 95444dae4d..783bfa6d11 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -238,6 +238,38 @@ async def test_filter_documents(self, document_store: OpenSearchDocumentStore): assert result[0].content == "2" assert result[0].meta["number"] == 100 + @pytest.mark.asyncio + async def test_count_documents_by_filter(self, document_store: OpenSearchDocumentStore): + filterable_docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active"}), + Document(content="Doc 2", meta={"category": "B", "status": "active"}), + Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), + Document(content="Doc 4", meta={"category": "A", "status": "active"}), + ] + await document_store.write_documents_async(filterable_docs) + assert await document_store.count_documents_async() == 4 + + count_a = await document_store.count_documents_by_filter_async( + filters={"field": "meta.category", "operator": "==", "value": "A"} + ) + assert count_a == 3 + + count_active = await document_store.count_documents_by_filter_async( + filters={"field": "meta.status", "operator": "==", "value": "active"} + ) + assert count_active == 3 + + count_a_active = await document_store.count_documents_by_filter_async( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "A"}, + {"field": "meta.status", "operator": "==", "value": "active"}, + ], + } + ) + assert count_a_active == 2 + @pytest.mark.asyncio async def test_delete_documents(self, document_store: OpenSearchDocumentStore): doc = Document(content="test doc") From 842da6a86acab107a1518118c67391e586b45532 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 5 Jan 2026 15:18:13 +0100 Subject: [PATCH 02/58] formmatting --- .../opensearch/document_store.py | 147 +++++++++--------- 1 file changed, 74 insertions(+), 73 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index b9c45e690e..53a81b182d 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -64,25 +64,25 @@ class OpenSearchDocumentStore: """ def __init__( - self, - *, - hosts: Optional[Hosts] = None, - index: str = "default", - max_chunk_bytes: int = DEFAULT_MAX_CHUNK_BYTES, - embedding_dim: int = 768, - return_embedding: bool = False, - method: Optional[dict[str, Any]] = None, - mappings: Optional[dict[str, Any]] = None, - settings: Optional[dict[str, Any]] = DEFAULT_SETTINGS, - create_index: bool = True, - http_auth: Any = ( - Secret.from_env_var("OPENSEARCH_USERNAME", strict=False), # noqa: B008 - Secret.from_env_var("OPENSEARCH_PASSWORD", strict=False), # noqa: B008 - ), - use_ssl: Optional[bool] = None, - verify_certs: Optional[bool] = None, - timeout: Optional[int] = None, - **kwargs: Any, + self, + *, + hosts: Optional[Hosts] = None, + index: str = "default", + max_chunk_bytes: int = DEFAULT_MAX_CHUNK_BYTES, + embedding_dim: int = 768, + return_embedding: bool = False, + method: Optional[dict[str, Any]] = None, + mappings: Optional[dict[str, Any]] = None, + settings: Optional[dict[str, Any]] = DEFAULT_SETTINGS, + create_index: bool = True, + http_auth: Any = ( + Secret.from_env_var("OPENSEARCH_USERNAME", strict=False), # noqa: B008 + Secret.from_env_var("OPENSEARCH_PASSWORD", strict=False), # noqa: B008 + ), + use_ssl: Optional[bool] = None, + verify_certs: Optional[bool] = None, + timeout: Optional[int] = None, + **kwargs: Any, ) -> None: """ Creates a new OpenSearchDocumentStore instance. @@ -174,10 +174,10 @@ def _get_default_mappings(self) -> dict[str, Any]: return default_mappings def create_index( - self, - index: Optional[str] = None, - mappings: Optional[dict[str, Any]] = None, - settings: Optional[dict[str, Any]] = None, + self, + index: Optional[str] = None, + mappings: Optional[dict[str, Any]] = None, + settings: Optional[dict[str, Any]] = None, ) -> None: """ Creates an index in OpenSearch. @@ -399,7 +399,7 @@ async def filter_documents_async(self, filters: Optional[dict[str, Any]] = None) return await self._search_documents_async(self._prepare_filter_search_request(filters)) def _prepare_bulk_write_request( - self, *, documents: list[Document], policy: DuplicatePolicy, is_async: bool + self, *, documents: list[Document], policy: DuplicatePolicy, is_async: bool ) -> dict[str, Any]: if len(documents) > 0 and not isinstance(documents[0], Document): msg = "param 'documents' must contain a list of objects of type Document" @@ -487,7 +487,7 @@ def write_documents(self, documents: list[Document], policy: DuplicatePolicy = D return documents_written async def write_documents_async( - self, documents: list[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE + self, documents: list[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE ) -> int: """ Asynchronously writes documents to the document store. @@ -757,14 +757,14 @@ async def update_by_filter_async(self, filters: dict[str, Any], meta: dict[str, raise DocumentStoreError(msg) from e def _prepare_bm25_search_request( - self, - *, - query: str, - filters: Optional[dict[str, Any]], - fuzziness: Union[int, str], - top_k: int, - all_terms_must_match: bool, - custom_query: Optional[dict[str, Any]], + self, + *, + query: str, + filters: Optional[dict[str, Any]], + fuzziness: Union[int, str], + top_k: int, + all_terms_must_match: bool, + custom_query: Optional[dict[str, Any]], ) -> dict[str, Any]: if not query: body: dict[str, Any] = {"query": {"bool": {"must": {"match_all": {}}}}} @@ -822,15 +822,15 @@ def _postprocess_bm25_search_results(*, results: list[Document], scale_score: bo doc.score = float(1 / (1 + exp(-(doc.score / float(BM25_SCALING_FACTOR))))) def _bm25_retrieval( - self, - query: str, - *, - filters: Optional[dict[str, Any]] = None, - fuzziness: Union[int, str] = "AUTO", - top_k: int = 10, - scale_score: bool = False, - all_terms_must_match: bool = False, - custom_query: Optional[dict[str, Any]] = None, + self, + query: str, + *, + filters: Optional[dict[str, Any]] = None, + fuzziness: Union[int, str] = "AUTO", + top_k: int = 10, + scale_score: bool = False, + all_terms_must_match: bool = False, + custom_query: Optional[dict[str, Any]] = None, ) -> list[Document]: """ Retrieves documents that match the provided `query` using the BM25 search algorithm. @@ -860,15 +860,15 @@ def _bm25_retrieval( return documents async def _bm25_retrieval_async( - self, - query: str, - *, - filters: Optional[dict[str, Any]] = None, - fuzziness: str = "AUTO", - top_k: int = 10, - scale_score: bool = False, - all_terms_must_match: bool = False, - custom_query: Optional[dict[str, Any]] = None, + self, + query: str, + *, + filters: Optional[dict[str, Any]] = None, + fuzziness: str = "AUTO", + top_k: int = 10, + scale_score: bool = False, + all_terms_must_match: bool = False, + custom_query: Optional[dict[str, Any]] = None, ) -> list[Document]: """ Asynchronously retrieves documents that match the provided `query` using the BM25 search algorithm. @@ -900,13 +900,13 @@ async def _bm25_retrieval_async( return documents def _prepare_embedding_search_request( - self, - *, - query_embedding: list[float], - filters: Optional[dict[str, Any]], - top_k: int, - custom_query: Optional[dict[str, Any]], - efficient_filtering: bool = False, + self, + *, + query_embedding: list[float], + filters: Optional[dict[str, Any]], + top_k: int, + custom_query: Optional[dict[str, Any]], + efficient_filtering: bool = False, ) -> dict[str, Any]: if not query_embedding: msg = "query_embedding must be a non-empty list of floats" @@ -956,13 +956,13 @@ def _prepare_embedding_search_request( return body def _embedding_retrieval( - self, - query_embedding: list[float], - *, - filters: Optional[dict[str, Any]] = None, - top_k: int = 10, - custom_query: Optional[dict[str, Any]] = None, - efficient_filtering: bool = False, + self, + query_embedding: list[float], + *, + filters: Optional[dict[str, Any]] = None, + top_k: int = 10, + custom_query: Optional[dict[str, Any]] = None, + efficient_filtering: bool = False, ) -> list[Document]: """ Retrieves documents that are most similar to the query embedding using a vector similarity metric. @@ -986,13 +986,13 @@ def _embedding_retrieval( return self._search_documents(search_params) async def _embedding_retrieval_async( - self, - query_embedding: list[float], - *, - filters: Optional[dict[str, Any]] = None, - top_k: int = 10, - custom_query: Optional[dict[str, Any]] = None, - efficient_filtering: bool = False, + self, + query_embedding: list[float], + *, + filters: Optional[dict[str, Any]] = None, + top_k: int = 10, + custom_query: Optional[dict[str, Any]] = None, + efficient_filtering: bool = False, ) -> list[Document]: """ Asynchronously retrieves documents that are most similar to the query embedding using a vector similarity @@ -1073,7 +1073,8 @@ def get_field_min_max(self, metadata_field: str) -> dict[str, Any]: pass def get_field_unique_values( - self, metadata_field: str, search_term: str | None, from_: int, size: int)-> tuple[list[str], int]: + self, metadata_field: str, search_term: str | None, from_: int, size: int + ) -> tuple[list[str], int]: pass def query_sql(self, query: str): From a28bb2aecfeb2699c5ba6310b748c6347abbf388 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 5 Jan 2026 16:19:32 +0100 Subject: [PATCH 03/58] adding count distinct metadata values --- .../opensearch/document_store.py | 106 +++++++++++++++++- .../opensearch/tests/test_document_store.py | 47 ++++++++ .../tests/test_document_store_async.py | 48 ++++++++ 3 files changed, 200 insertions(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 53a81b182d..09a1ab9502 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1064,7 +1064,111 @@ async def count_documents_by_filter_async(self, filters: dict) -> int: return (await self._async_client.count(index=self._index, body=body))["count"] def count_distinct_values_by_filter(self, filters: dict) -> dict[str, int]: - pass + """ + Returns the number of unique values for each meta field of the documents that match the provided filters. + + :param filters: The filters to apply to count documents. + For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) + :returns: The number of unique values for each meta field of the documents that match the filters. + """ + self._ensure_initialized() + assert self._client is not None + + # use index mapping to get all fields + mapping = self._client.indices.get_mapping(index=self._index) + index_mapping = mapping[self._index]["mappings"]["properties"] + + # aggregations for each metadata field (exclude special fields) + special_fields = {"content", "embedding", "id", "score", "blob", "sparse_embedding"} + aggs = {} + for field_name in index_mapping.keys(): + if field_name not in special_fields: + aggs[f"{field_name}_cardinality"] = {"cardinality": {"field": field_name}} + + if not aggs: + return {} + + # search query with filters and aggregations + if filters: + normalized_filters = normalize_filters(filters) + body = { + "query": {"bool": {"filter": normalized_filters}}, + "aggs": aggs, + "size": 0, # We only need aggregations, not documents + } + else: + # No filters - match all documents + body = { + "query": {"match_all": {}}, + "aggs": aggs, + "size": 0, # We only need aggregations, not documents + } + result = self._client.search(index=self._index, body=body) + + # extract cardinality values for each field + distinct_counts = {} + aggregations = result.get("aggregations", {}) + for field_name in index_mapping.keys(): + if field_name not in special_fields: + agg_key = f"{field_name}_cardinality" + if agg_key in aggregations: + distinct_counts[field_name] = aggregations[agg_key]["value"] + + return distinct_counts + + async def count_distinct_values_by_filter_async(self, filters: dict) -> dict[str, int]: + """ + Asynchronously returns the number of unique values for each meta field of the documents that match the + provided filters. + + :param filters: The filters to apply to count documents. + For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) + :returns: The number of unique values for each meta field of the documents that match the filters. + """ + await self._ensure_initialized_async() + assert self._async_client is not None + + # use index mapping to get all fields + mapping = await self._async_client.indices.get_mapping(index=self._index) + index_mapping = mapping[self._index]["mappings"]["properties"] + + # aggregations for each metadata field (exclude special fields) + special_fields = {"content", "embedding", "id", "score", "blob", "sparse_embedding"} + aggs = {} + for field_name in index_mapping.keys(): + if field_name not in special_fields: + aggs[f"{field_name}_cardinality"] = {"cardinality": {"field": field_name}} + + if not aggs: + return {} + + # search query with filters and aggregations + if filters: + normalized_filters = normalize_filters(filters) + body = { + "query": {"bool": {"filter": normalized_filters}}, + "aggs": aggs, + "size": 0, # We only need aggregations, not documents + } + else: + # No filters - match all documents + body = { + "query": {"match_all": {}}, + "aggs": aggs, + "size": 0, # We only need aggregations, not documents + } + result = await self._async_client.search(index=self._index, body=body) + + # extract cardinality values for each field + distinct_counts = {} + aggregations = result.get("aggregations", {}) + for field_name in index_mapping.keys(): + if field_name not in special_fields: + agg_key = f"{field_name}_cardinality" + if agg_key in aggregations: + distinct_counts[field_name] = aggregations[agg_key]["value"] + + return distinct_counts def get_fields_info(self) -> dict[str, dict]: pass diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index c35a9449b2..393d1cc74d 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -602,3 +602,50 @@ def test_count_documents_by_filter(self, document_store: OpenSearchDocumentStore } ) assert count_a_active == 2 + + def test_count_distinct_values_by_filter(self, document_store: OpenSearchDocumentStore): + docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), + Document(content="Doc 2", meta={"category": "B", "status": "active", "priority": 2}), + Document(content="Doc 3", meta={"category": "A", "status": "inactive", "priority": 1}), + Document(content="Doc 4", meta={"category": "A", "status": "active", "priority": 3}), + Document(content="Doc 5", meta={"category": "C", "status": "active", "priority": 2}), + ] + document_store.write_documents(docs) + assert document_store.count_documents() == 5 + + # Count distinct values for all documents + distinct_counts = document_store.count_distinct_values_by_filter(filters={}) + assert distinct_counts["category"] == 3 # A, B, C + assert distinct_counts["status"] == 2 # active, inactive + assert distinct_counts["priority"] == 3 # 1, 2, 3 + + # Count distinct values for documents with category="A" + distinct_counts_a = document_store.count_distinct_values_by_filter( + filters={"field": "meta.category", "operator": "==", "value": "A"} + ) + assert distinct_counts_a["category"] == 1 # Only A + assert distinct_counts_a["status"] == 2 # active, inactive + assert distinct_counts_a["priority"] == 2 # 1, 3 + + # Count distinct values for documents with status="active" + distinct_counts_active = document_store.count_distinct_values_by_filter( + filters={"field": "meta.status", "operator": "==", "value": "active"} + ) + assert distinct_counts_active["category"] == 3 # A, B, C + assert distinct_counts_active["status"] == 1 # Only active + assert distinct_counts_active["priority"] == 3 # 1, 2, 3 + + # Count distinct values with complex filter (category="A" AND status="active") + distinct_counts_a_active = document_store.count_distinct_values_by_filter( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "A"}, + {"field": "meta.status", "operator": "==", "value": "active"}, + ], + } + ) + assert distinct_counts_a_active["category"] == 1 # Only A + assert distinct_counts_a_active["status"] == 1 # Only active + assert distinct_counts_a_active["priority"] == 2 # 1, 3 diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index 783bfa6d11..f3cd7922ff 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -270,6 +270,54 @@ async def test_count_documents_by_filter(self, document_store: OpenSearchDocumen ) assert count_a_active == 2 + @pytest.mark.asyncio + async def test_count_distinct_values_by_filter(self, document_store: OpenSearchDocumentStore): + filterable_docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), + Document(content="Doc 2", meta={"category": "B", "status": "active", "priority": 2}), + Document(content="Doc 3", meta={"category": "A", "status": "inactive", "priority": 1}), + Document(content="Doc 4", meta={"category": "A", "status": "active", "priority": 3}), + Document(content="Doc 5", meta={"category": "C", "status": "active", "priority": 2}), + ] + await document_store.write_documents_async(filterable_docs) + assert await document_store.count_documents_async() == 5 + + # count distinct values for all documents + distinct_counts = await document_store.count_distinct_values_by_filter_async(filters={}) + assert distinct_counts["category"] == 3 # A, B, C + assert distinct_counts["status"] == 2 # active, inactive + assert distinct_counts["priority"] == 3 # 1, 2, 3 + + # count distinct values for documents with category="A" + distinct_counts_a = await document_store.count_distinct_values_by_filter_async( + filters={"field": "meta.category", "operator": "==", "value": "A"} + ) + assert distinct_counts_a["category"] == 1 # Only A + assert distinct_counts_a["status"] == 2 # active, inactive + assert distinct_counts_a["priority"] == 2 # 1, 3 + + # count distinct values for documents with status="active" + distinct_counts_active = await document_store.count_distinct_values_by_filter_async( + filters={"field": "meta.status", "operator": "==", "value": "active"} + ) + assert distinct_counts_active["category"] == 3 # A, B, C + assert distinct_counts_active["status"] == 1 # Only active + assert distinct_counts_active["priority"] == 3 # 1, 2, 3 + + # count distinct values with complex filter (category="A" AND status="active") + distinct_counts_a_active = await document_store.count_distinct_values_by_filter_async( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "A"}, + {"field": "meta.status", "operator": "==", "value": "active"}, + ], + } + ) + assert distinct_counts_a_active["category"] == 1 # Only A + assert distinct_counts_a_active["status"] == 1 # Only active + assert distinct_counts_a_active["priority"] == 2 # 1, 3 + @pytest.mark.asyncio async def test_delete_documents(self, document_store: OpenSearchDocumentStore): doc = Document(content="test doc") From b0b594caf5d3ee8e2c95d18fb83991c5c35e9535 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 5 Jan 2026 16:26:20 +0100 Subject: [PATCH 04/58] refactoring to reduce duplicated code --- .../opensearch/document_store.py | 115 ++++++++++-------- 1 file changed, 61 insertions(+), 54 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 09a1ab9502..a3533b241e 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1063,59 +1063,92 @@ async def count_documents_by_filter_async(self, filters: dict) -> int: body = {"query": {"bool": {"filter": normalized_filters}}} return (await self._async_client.count(index=self._index, body=body))["count"] - def count_distinct_values_by_filter(self, filters: dict) -> dict[str, int]: + @staticmethod + def _build_cardinality_aggregations(index_mapping: dict[str, Any]) -> dict[str, Any]: """ - Returns the number of unique values for each meta field of the documents that match the provided filters. + Builds cardinality aggregations for all metadata fields in the index mapping. - :param filters: The filters to apply to count documents. - For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) - :returns: The number of unique values for each meta field of the documents that match the filters. + :param index_mapping: The properties mapping from the index. + :returns: Dictionary of aggregations keyed by field name. """ - self._ensure_initialized() - assert self._client is not None - - # use index mapping to get all fields - mapping = self._client.indices.get_mapping(index=self._index) - index_mapping = mapping[self._index]["mappings"]["properties"] - - # aggregations for each metadata field (exclude special fields) special_fields = {"content", "embedding", "id", "score", "blob", "sparse_embedding"} aggs = {} for field_name in index_mapping.keys(): if field_name not in special_fields: aggs[f"{field_name}_cardinality"] = {"cardinality": {"field": field_name}} + return aggs - if not aggs: - return {} + @staticmethod + def _build_distinct_values_query_body(filters: dict, aggs: dict[str, Any]) -> dict[str, Any]: + """ + Builds the query body for distinct values counting with filters and aggregations. - # search query with filters and aggregations + :param filters: The filters to apply, or empty dict for no filters. + :param aggs: The aggregations to include in the query. + :returns: The query body dictionary. + """ if filters: normalized_filters = normalize_filters(filters) - body = { + return { "query": {"bool": {"filter": normalized_filters}}, "aggs": aggs, "size": 0, # We only need aggregations, not documents } else: # No filters - match all documents - body = { + return { "query": {"match_all": {}}, "aggs": aggs, "size": 0, # We only need aggregations, not documents } - result = self._client.search(index=self._index, body=body) - # extract cardinality values for each field + @staticmethod + def _extract_distinct_counts_from_aggregations( + aggregations: dict[str, Any], index_mapping: dict[str, Any] + ) -> dict[str, int]: + """ + Extracts distinct value counts from search result aggregations. + + :param aggregations: The aggregations from the search result. + :param index_mapping: The properties mapping from the index. + :returns: Dictionary mapping field names to their distinct value counts. + """ + special_fields = {"content", "embedding", "id", "score", "blob", "sparse_embedding"} distinct_counts = {} - aggregations = result.get("aggregations", {}) for field_name in index_mapping.keys(): if field_name not in special_fields: agg_key = f"{field_name}_cardinality" if agg_key in aggregations: distinct_counts[field_name] = aggregations[agg_key]["value"] - return distinct_counts + def count_distinct_values_by_filter(self, filters: dict) -> dict[str, int]: + """ + Returns the number of unique values for each meta field of the documents that match the provided filters. + + :param filters: The filters to apply to count documents. + For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) + :returns: The number of unique values for each meta field of the documents that match the filters. + """ + self._ensure_initialized() + assert self._client is not None + + # use index mapping to get all fields + mapping = self._client.indices.get_mapping(index=self._index) + index_mapping = mapping[self._index]["mappings"]["properties"] + + # build aggregations for each metadata field + aggs = self._build_cardinality_aggregations(index_mapping) + if not aggs: + return {} + + # build and execute search query + body = self._build_distinct_values_query_body(filters, aggs) + result = self._client.search(index=self._index, body=body) + + # extract cardinality values from aggregations + return self._extract_distinct_counts_from_aggregations(result.get("aggregations", {}), index_mapping) + async def count_distinct_values_by_filter_async(self, filters: dict) -> dict[str, int]: """ Asynchronously returns the number of unique values for each meta field of the documents that match the @@ -1132,43 +1165,17 @@ async def count_distinct_values_by_filter_async(self, filters: dict) -> dict[str mapping = await self._async_client.indices.get_mapping(index=self._index) index_mapping = mapping[self._index]["mappings"]["properties"] - # aggregations for each metadata field (exclude special fields) - special_fields = {"content", "embedding", "id", "score", "blob", "sparse_embedding"} - aggs = {} - for field_name in index_mapping.keys(): - if field_name not in special_fields: - aggs[f"{field_name}_cardinality"] = {"cardinality": {"field": field_name}} - + # build aggregations for each metadata field + aggs = self._build_cardinality_aggregations(index_mapping) if not aggs: return {} - # search query with filters and aggregations - if filters: - normalized_filters = normalize_filters(filters) - body = { - "query": {"bool": {"filter": normalized_filters}}, - "aggs": aggs, - "size": 0, # We only need aggregations, not documents - } - else: - # No filters - match all documents - body = { - "query": {"match_all": {}}, - "aggs": aggs, - "size": 0, # We only need aggregations, not documents - } + # build and execute search query + body = self._build_distinct_values_query_body(filters, aggs) result = await self._async_client.search(index=self._index, body=body) - # extract cardinality values for each field - distinct_counts = {} - aggregations = result.get("aggregations", {}) - for field_name in index_mapping.keys(): - if field_name not in special_fields: - agg_key = f"{field_name}_cardinality" - if agg_key in aggregations: - distinct_counts[field_name] = aggregations[agg_key]["value"] - - return distinct_counts + # extract cardinality values from aggregations + return self._extract_distinct_counts_from_aggregations(result.get("aggregations", {}), index_mapping) def get_fields_info(self) -> dict[str, dict]: pass From b23274fd3331c439f7eab7f7bbbd1865ba5e131f Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 5 Jan 2026 16:59:04 +0100 Subject: [PATCH 05/58] adding get metadata info --- .../opensearch/document_store.py | 25 ++++++++++++++++++- .../opensearch/tests/test_document_store.py | 24 ++++++++++++++++++ .../tests/test_document_store_async.py | 25 +++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index a3533b241e..88a7150e81 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1178,7 +1178,30 @@ async def count_distinct_values_by_filter_async(self, filters: dict) -> dict[str return self._extract_distinct_counts_from_aggregations(result.get("aggregations", {}), index_mapping) def get_fields_info(self) -> dict[str, dict]: - pass + """ + Returns the information about the fields in the index. + + :returns: The information about the fields in the index. + """ + self._ensure_initialized() + assert self._client is not None + + mapping = self._client.indices.get_mapping(index=self._index) + index_mapping = mapping[self._index]["mappings"]["properties"] + return index_mapping + + async def get_fields_info_async(self) -> dict[str, dict]: + """ + Asynchronously returns the information about the fields in the index. + + :returns: The information about the fields in the index. + """ + await self._ensure_initialized_async() + assert self._async_client is not None + + mapping = await self._async_client.indices.get_mapping(index=self._index) + index_mapping = mapping[self._index]["mappings"]["properties"] + return index_mapping def get_field_min_max(self, metadata_field: str) -> dict[str, Any]: pass diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 393d1cc74d..a8d033dcbd 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -649,3 +649,27 @@ def test_count_distinct_values_by_filter(self, document_store: OpenSearchDocumen assert distinct_counts_a_active["category"] == 1 # Only A assert distinct_counts_a_active["status"] == 1 # Only active assert distinct_counts_a_active["priority"] == 2 # 1, 3 + + def test_get_fields_info(self, document_store: OpenSearchDocumentStore): + docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), + Document(content="Doc 2", meta={"category": "B", "status": "inactive"}), + ] + document_store.write_documents(docs) + + fields_info = document_store.get_fields_info() + + # Verify that fields_info contains expected fields + assert "content" in fields_info + assert "embedding" in fields_info + assert "category" in fields_info + assert "status" in fields_info + assert "priority" in fields_info + + # Verify field types + assert fields_info["content"]["type"] == "text" + assert fields_info["embedding"]["type"] == "knn_vector" + # Metadata fields should be keyword type (from dynamic templates) + assert fields_info["category"]["type"] == "keyword" + assert fields_info["status"]["type"] == "keyword" + assert fields_info["priority"]["type"] == "long" diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index f3cd7922ff..58844ea1b5 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -436,3 +436,28 @@ async def test_update_by_filter_async(self, document_store: OpenSearchDocumentSt ) assert len(draft_docs) == 1 assert draft_docs[0].meta["category"] == "B" + + @pytest.mark.asyncio + async def test_get_fields_info(self, document_store: OpenSearchDocumentStore): + filterable_docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), + Document(content="Doc 2", meta={"category": "B", "status": "inactive"}), + ] + await document_store.write_documents_async(filterable_docs) + + fields_info = await document_store.get_fields_info_async() + + # Verify that fields_info contains expected fields + assert "content" in fields_info + assert "embedding" in fields_info + assert "category" in fields_info + assert "status" in fields_info + assert "priority" in fields_info + + # Verify field types + assert fields_info["content"]["type"] == "text" + assert fields_info["embedding"]["type"] == "knn_vector" + # Metadata fields should be keyword type (from dynamic templates) + assert fields_info["category"]["type"] == "keyword" + assert fields_info["status"]["type"] == "keyword" + assert fields_info["priority"]["type"] == "long" From 22e160d6c800fab3f43d2ae36e717148d5724f45 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 5 Jan 2026 17:17:45 +0100 Subject: [PATCH 06/58] adding get_field_max_min --- .../opensearch/document_store.py | 66 ++++++++++++++++++- .../opensearch/tests/test_document_store.py | 26 ++++++++ .../tests/test_document_store_async.py | 27 ++++++++ 3 files changed, 118 insertions(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 88a7150e81..4b508a1700 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1203,8 +1203,72 @@ async def get_fields_info_async(self) -> dict[str, dict]: index_mapping = mapping[self._index]["mappings"]["properties"] return index_mapping + @staticmethod + def _normalize_metadata_field_name(metadata_field: str) -> str: + """ + Normalizes a metadata field name by removing the "meta." prefix if present. + """ + return metadata_field[5:] if metadata_field.startswith("meta.") else metadata_field + + @staticmethod + def _build_min_max_query_body(field_name: str) -> dict[str, Any]: + """ + Builds the query body for getting min and max values using stats aggregation. + """ + return { + "query": {"match_all": {}}, + "aggs": { + "field_stats": { + "stats": { + "field": field_name, + } + } + }, + "size": 0, # We only need aggregations, not documents + } + + @staticmethod + def _extract_min_max_from_stats(stats: dict[str, Any]) -> dict[str, Any]: + """ + Extracts min and max values from stats aggregation results. + """ + min_value = stats.get("min") + max_value = stats.get("max") + return {"min": min_value, "max": max_value} + def get_field_min_max(self, metadata_field: str) -> dict[str, Any]: - pass + """ + Returns the minimum and maximum values for the given metadata field. + + :param metadata_field: The metadata field to get the minimum and maximum values for. + :returns: The minimum and maximum values for the given metadata field. + """ + self._ensure_initialized() + assert self._client is not None + + field_name = self._normalize_metadata_field_name(metadata_field) + body = self._build_min_max_query_body(field_name) + result = self._client.search(index=self._index, body=body) + stats = result.get("aggregations", {}).get("field_stats", {}) + + return self._extract_min_max_from_stats(stats) + + async def get_field_min_max_async(self, metadata_field: str) -> dict[str, Any]: + """ + Asynchronously returns the minimum and maximum values for the given metadata field. + + :param metadata_field: The metadata field to get the minimum and maximum values for. + :returns: The minimum and maximum values for the given metadata field. + """ + await self._ensure_initialized_async() + assert self._async_client is not None + + field_name = self._normalize_metadata_field_name(metadata_field) + body = self._build_min_max_query_body(field_name) + result = await self._async_client.search(index=self._index, body=body) + stats = result.get("aggregations", {}).get("field_stats", {}) + + return self._extract_min_max_from_stats(stats) def get_field_unique_values( self, metadata_field: str, search_term: str | None, from_: int, size: int diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index a8d033dcbd..da34210be5 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -673,3 +673,29 @@ def test_get_fields_info(self, document_store: OpenSearchDocumentStore): assert fields_info["category"]["type"] == "keyword" assert fields_info["status"]["type"] == "keyword" assert fields_info["priority"]["type"] == "long" + + def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): + docs = [ + Document(content="Doc 1", meta={"priority": 1, "rating": 10}), + Document(content="Doc 2", meta={"priority": 5, "rating": 20}), + Document(content="Doc 3", meta={"priority": 3, "rating": 15}), + Document(content="Doc 4", meta={"priority": 10, "rating": 5}), + ] + document_store.write_documents(docs) + + # Test with "meta." prefix for integer field + min_max_priority = document_store.get_field_min_max("meta.priority") + assert min_max_priority["min"] == 1 + assert min_max_priority["max"] == 10 + + # Test with "meta." prefix for another integer field + min_max_rating = document_store.get_field_min_max("meta.rating") + assert min_max_rating["min"] == 5 + assert min_max_rating["max"] == 20 + + # Test with single value + single_doc = [Document(content="Doc 5", meta={"single_value": 42})] + document_store.write_documents(single_doc) + min_max_single = document_store.get_field_min_max("meta.single_value") + assert min_max_single["min"] == 42 + assert min_max_single["max"] == 42 diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index 58844ea1b5..578924536f 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -461,3 +461,30 @@ async def test_get_fields_info(self, document_store: OpenSearchDocumentStore): assert fields_info["category"]["type"] == "keyword" assert fields_info["status"]["type"] == "keyword" assert fields_info["priority"]["type"] == "long" + + @pytest.mark.asyncio + async def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): + filterable_docs = [ + Document(content="Doc 1", meta={"priority": 1, "rating": 10}), + Document(content="Doc 2", meta={"priority": 5, "rating": 20}), + Document(content="Doc 3", meta={"priority": 3, "rating": 15}), + Document(content="Doc 4", meta={"priority": 10, "rating": 5}), + ] + await document_store.write_documents_async(filterable_docs) + + # Test with "meta." prefix for integer field + min_max_priority = await document_store.get_field_min_max_async("meta.priority") + assert min_max_priority["min"] == 1 + assert min_max_priority["max"] == 10 + + # Test with "meta." prefix for another integer field + min_max_rating = await document_store.get_field_min_max_async("meta.rating") + assert min_max_rating["min"] == 5 + assert min_max_rating["max"] == 20 + + # Test with single value + single_doc = [Document(content="Doc 5", meta={"single_value": 42})] + await document_store.write_documents_async(single_doc) + min_max_single = await document_store.get_field_min_max_async("meta.single_value") + assert min_max_single["min"] == 42 + assert min_max_single["max"] == 42 From 310846d44b82ef86db1162115043eafa0d581575 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 5 Jan 2026 18:09:23 +0100 Subject: [PATCH 07/58] fixing get_field_max_min --- .../opensearch/document_store.py | 25 ++++++++++++++++-- .../opensearch/tests/test_document_store.py | 20 ++++++++++---- .../tests/test_document_store_async.py | 26 ++++++++++++++----- 3 files changed, 57 insertions(+), 14 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 4b508a1700..4dbfc7a710 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -341,10 +341,31 @@ async def count_documents_async(self) -> int: @staticmethod def _deserialize_search_hits(hits: list[dict[str, Any]]) -> list[Document]: out = [] + # Fields that are not metadata (should stay at top level) + non_meta_fields = {"id", "content", "embedding", "blob", "sparse_embedding", "score"} + for hit in hits: - data = hit["_source"] + data = hit["_source"].copy() + + # Reconstruct metadata dict from flattened fields + meta = {} + fields_to_remove = [] + for key, value in data.items(): + if key not in non_meta_fields: + meta[key] = value + fields_to_remove.append(key) + + # Remove metadata fields from top level and add them to meta + for key in fields_to_remove: + data.pop(key, None) + + if meta: + data["meta"] = meta + if "highlight" in hit: - data["metadata"]["highlighted"] = hit["highlight"] + if "meta" not in data: + data["meta"] = {} + data["meta"]["highlighted"] = hit["highlight"] data["score"] = hit["_score"] out.append(Document.from_dict(data)) diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index da34210be5..fc8226fbb6 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -675,11 +675,16 @@ def test_get_fields_info(self, document_store: OpenSearchDocumentStore): assert fields_info["priority"]["type"] == "long" def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): + # Test with integer values docs = [ - Document(content="Doc 1", meta={"priority": 1, "rating": 10}), - Document(content="Doc 2", meta={"priority": 5, "rating": 20}), - Document(content="Doc 3", meta={"priority": 3, "rating": 15}), - Document(content="Doc 4", meta={"priority": 10, "rating": 5}), + Document(content="Doc 1", meta={"priority": 1, "age": 10}), + Document(content="Doc 2", meta={"priority": 5, "age": 20}), + Document(content="Doc 3", meta={"priority": 3, "age": 15}), + Document(content="Doc 4", meta={"priority": 10, "age": 5}), + Document(content="Doc 6", meta={"rating": 10.5}), + Document(content="Doc 7", meta={"rating": 20.3}), + Document(content="Doc 8", meta={"rating": 15.7}), + Document(content="Doc 9", meta={"rating": 5.2}), ] document_store.write_documents(docs) @@ -689,7 +694,7 @@ def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): assert min_max_priority["max"] == 10 # Test with "meta." prefix for another integer field - min_max_rating = document_store.get_field_min_max("meta.rating") + min_max_rating = document_store.get_field_min_max("meta.age") assert min_max_rating["min"] == 5 assert min_max_rating["max"] == 20 @@ -699,3 +704,8 @@ def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): min_max_single = document_store.get_field_min_max("meta.single_value") assert min_max_single["min"] == 42 assert min_max_single["max"] == 42 + + # Test with float values + min_max_score = document_store.get_field_min_max("meta.rating") + assert min_max_score["min"] == pytest.approx(5.2) + assert min_max_score["max"] == pytest.approx(20.3) \ No newline at end of file diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index 578924536f..fe4f8ec726 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -464,13 +464,18 @@ async def test_get_fields_info(self, document_store: OpenSearchDocumentStore): @pytest.mark.asyncio async def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): - filterable_docs = [ - Document(content="Doc 1", meta={"priority": 1, "rating": 10}), - Document(content="Doc 2", meta={"priority": 5, "rating": 20}), - Document(content="Doc 3", meta={"priority": 3, "rating": 15}), - Document(content="Doc 4", meta={"priority": 10, "rating": 5}), + # Test with integer values + docs = [ + Document(content="Doc 1", meta={"priority": 1, "age": 10}), + Document(content="Doc 2", meta={"priority": 5, "age": 20}), + Document(content="Doc 3", meta={"priority": 3, "age": 15}), + Document(content="Doc 4", meta={"priority": 10, "age": 5}), + Document(content="Doc 6", meta={"rating": 10.5}), + Document(content="Doc 7", meta={"rating": 20.3}), + Document(content="Doc 8", meta={"rating": 15.7}), + Document(content="Doc 9", meta={"rating": 5.2}), ] - await document_store.write_documents_async(filterable_docs) + await document_store.write_documents_async(docs) # Test with "meta." prefix for integer field min_max_priority = await document_store.get_field_min_max_async("meta.priority") @@ -478,7 +483,7 @@ async def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): assert min_max_priority["max"] == 10 # Test with "meta." prefix for another integer field - min_max_rating = await document_store.get_field_min_max_async("meta.rating") + min_max_rating = await document_store.get_field_min_max_async("meta.age") assert min_max_rating["min"] == 5 assert min_max_rating["max"] == 20 @@ -488,3 +493,10 @@ async def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): min_max_single = await document_store.get_field_min_max_async("meta.single_value") assert min_max_single["min"] == 42 assert min_max_single["max"] == 42 + + # Test with float values + min_max_score = await document_store.get_field_min_max_async("meta.rating") + assert min_max_score["min"] == pytest.approx(5.2) + assert min_max_score["max"] == pytest.approx(20.3) + + \ No newline at end of file From e0be21f351fd7028a048f10eaec932d621ad3f40 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 5 Jan 2026 18:50:45 +0100 Subject: [PATCH 08/58] adding get_field_unique_values --- .../opensearch/document_store.py | 58 ++++++++++++++++- .../opensearch/tests/test_document_store.py | 63 ++++++++++++++++++- 2 files changed, 119 insertions(+), 2 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 4dbfc7a710..fbf6419d82 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1294,7 +1294,63 @@ async def get_field_min_max_async(self, metadata_field: str) -> dict[str, Any]: def get_field_unique_values( self, metadata_field: str, search_term: str | None, from_: int, size: int ) -> tuple[list[str], int]: - pass + """ + Returns unique values for a metadata field, optionally filtered by a search term in the content. + + :param metadata_field: The metadata field to get unique values for. + :param search_term: Optional search term to filter documents by matching in the content field. + :param from_: The starting index for pagination. + :param size: The number of unique values to return. + :returns: A tuple containing (list of unique values, total count of unique values). + """ + self._ensure_initialized() + assert self._client is not None + + field_name = self._normalize_metadata_field_name(metadata_field) + + # filter by search_term if provided + query = {"match_all": {}} + if search_term: + # Use match_phrase for exact phrase matching to avoid tokenization issues + query = {"match_phrase": {"content": search_term}} + + # Build aggregations + # Terms aggregation for paginated unique values + # Note: Terms aggregation doesn't support 'from' parameter directly, + # so we fetch from_ + size results and slice them + # Cardinality aggregation for total count + terms_size = from_ + size if from_ > 0 else size + body = { + "query": query, + "aggs": { + "unique_values": { + "terms": { + "field": field_name, + "size": terms_size, + } + }, + "total_count": { + "cardinality": { + "field": field_name, + } + }, + }, + "size": 0, # we only need aggregations, not documents + } + + result = self._client.search(index=self._index, body=body) + aggregations = result.get("aggregations", {}) + + # Extract unique values from terms aggregation buckets + unique_values_buckets = aggregations.get("unique_values", {}).get("buckets", []) + # Apply pagination by slicing the results + paginated_buckets = unique_values_buckets[from_ : from_ + size] + unique_values = [str(bucket["key"]) for bucket in paginated_buckets] + + # Extract total count from cardinality aggregation + total_count = int(aggregations.get("total_count", {}).get("value", 0)) + + return unique_values, total_count def query_sql(self, query: str): pass diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index fc8226fbb6..90d318dc71 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -708,4 +708,65 @@ def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): # Test with float values min_max_score = document_store.get_field_min_max("meta.rating") assert min_max_score["min"] == pytest.approx(5.2) - assert min_max_score["max"] == pytest.approx(20.3) \ No newline at end of file + assert min_max_score["max"] == pytest.approx(20.3) + + def test_get_field_unique_values(self, document_store: OpenSearchDocumentStore): + # Test with string values + docs = [ + Document(content="Python programming", meta={"category": "A", "language": "Python"}), + Document(content="Java programming", meta={"category": "B", "language": "Java"}), + Document(content="Python scripting", meta={"category": "A", "language": "Python"}), + Document(content="JavaScript development", meta={"category": "C", "language": "JavaScript"}), + Document(content="Python data science", meta={"category": "A", "language": "Python"}), + Document(content="Java backend", meta={"category": "B", "language": "Java"}), + ] + document_store.write_documents(docs) + + # Test getting all unique values without search term + unique_values, total_count = document_store.get_field_unique_values("meta.category", None, 0, 10) + assert set(unique_values) == {"A", "B", "C"} + assert total_count == 3 + + # Test with "meta." prefix + unique_languages, lang_count = document_store.get_field_unique_values("meta.language", None, 0, 10) + assert set(unique_languages) == {"Python", "Java", "JavaScript"} + assert lang_count == 3 + + # Test pagination - first page + unique_values_page1, total_count = document_store.get_field_unique_values("meta.category", None, 0, 2) + assert len(unique_values_page1) == 2 + assert total_count == 3 + assert all(val in ["A", "B", "C"] for val in unique_values_page1) + + # Test pagination - second page + unique_values_page2, total_count = document_store.get_field_unique_values("meta.category", None, 2, 2) + assert len(unique_values_page2) == 1 + assert total_count == 3 + assert unique_values_page2[0] in ["A", "B", "C"] + + # Test with search term - filter by content matching "Python" + unique_values_filtered, total_count = document_store.get_field_unique_values("meta.category", "Python", 0, 10) + assert set(unique_values_filtered) == {"A"} # Only category A has documents with "Python" in content + assert total_count == 1 + + # Test with search term - filter by content matching "Java" + unique_values_java, total_count = document_store.get_field_unique_values("meta.category", "Java", 0, 10) + assert set(unique_values_java) == {"B"} # Only category B has documents with "Java" in content + assert total_count == 1 + + # Test with integer values + int_docs = [ + Document(content="Doc 1", meta={"priority": 1}), + Document(content="Doc 2", meta={"priority": 2}), + Document(content="Doc 3", meta={"priority": 1}), + Document(content="Doc 4", meta={"priority": 3}), + ] + document_store.write_documents(int_docs) + unique_priorities, priority_count = document_store.get_field_unique_values("meta.priority", None, 0, 10) + assert set(unique_priorities) == {"1", "2", "3"} + assert priority_count == 3 + + # Test with search term on integer field + unique_priorities_filtered, priority_count = document_store.get_field_unique_values("meta.priority", "Doc 1", 0, 10) + assert set(unique_priorities_filtered) == {"1"} + assert priority_count == 1 \ No newline at end of file From e6932b0d73445d350f134ef8988e35c458a8fcbd Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 5 Jan 2026 18:51:59 +0100 Subject: [PATCH 09/58] adding get_field_unique_values async --- .../opensearch/document_store.py | 61 ++++++++++++++++++ .../tests/test_document_store_async.py | 62 +++++++++++++++++++ 2 files changed, 123 insertions(+) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index fbf6419d82..cd902f3ac4 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1352,5 +1352,66 @@ def get_field_unique_values( return unique_values, total_count + async def get_field_unique_values_async( + self, metadata_field: str, search_term: str | None, from_: int, size: int + ) -> tuple[list[str], int]: + """ + Asynchronously returns unique values for a metadata field, optionally filtered by a search term in the content. + + :param metadata_field: The metadata field to get unique values for. + :param search_term: Optional search term to filter documents by matching in the content field. + :param from_: The starting index for pagination. + :param size: The number of unique values to return. + :returns: A tuple containing (list of unique values, total count of unique values). + """ + await self._ensure_initialized_async() + assert self._async_client is not None + + field_name = self._normalize_metadata_field_name(metadata_field) + + # filter by search_term if provided + query = {"match_all": {}} + if search_term: + # Use match_phrase for exact phrase matching to avoid tokenization issues + query = {"match_phrase": {"content": search_term}} + + # Build aggregations + # Terms aggregation for paginated unique values + # Note: Terms aggregation doesn't support 'from' parameter directly, + # so we fetch from_ + size results and slice them + # Cardinality aggregation for total count + terms_size = from_ + size if from_ > 0 else size + body = { + "query": query, + "aggs": { + "unique_values": { + "terms": { + "field": field_name, + "size": terms_size, + } + }, + "total_count": { + "cardinality": { + "field": field_name, + } + }, + }, + "size": 0, # we only need aggregations, not documents + } + + result = await self._async_client.search(index=self._index, body=body) + aggregations = result.get("aggregations", {}) + + # Extract unique values from terms aggregation buckets + unique_values_buckets = aggregations.get("unique_values", {}).get("buckets", []) + # Apply pagination by slicing the results + paginated_buckets = unique_values_buckets[from_ : from_ + size] + unique_values = [str(bucket["key"]) for bucket in paginated_buckets] + + # Extract total count from cardinality aggregation + total_count = int(aggregations.get("total_count", {}).get("value", 0)) + + return unique_values, total_count + def query_sql(self, query: str): pass diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index fe4f8ec726..28bdebe64a 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -499,4 +499,66 @@ async def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): assert min_max_score["min"] == pytest.approx(5.2) assert min_max_score["max"] == pytest.approx(20.3) + @pytest.mark.asyncio + async def test_get_field_unique_values(self, document_store: OpenSearchDocumentStore): + # Test with string values + docs = [ + Document(content="Python programming", meta={"category": "A", "language": "Python"}), + Document(content="Java programming", meta={"category": "B", "language": "Java"}), + Document(content="Python scripting", meta={"category": "A", "language": "Python"}), + Document(content="JavaScript development", meta={"category": "C", "language": "JavaScript"}), + Document(content="Python data science", meta={"category": "A", "language": "Python"}), + Document(content="Java backend", meta={"category": "B", "language": "Java"}), + ] + await document_store.write_documents_async(docs) + + # Test getting all unique values without search term + unique_values, total_count = await document_store.get_field_unique_values_async("meta.category", None, 0, 10) + assert set(unique_values) == {"A", "B", "C"} + assert total_count == 3 + + # Test with "meta." prefix + unique_languages, lang_count = await document_store.get_field_unique_values_async("meta.language", None, 0, 10) + assert set(unique_languages) == {"Python", "Java", "JavaScript"} + assert lang_count == 3 + + # Test pagination - first page + unique_values_page1, total_count = await document_store.get_field_unique_values_async("meta.category", None, 0, 2) + assert len(unique_values_page1) == 2 + assert total_count == 3 + assert all(val in ["A", "B", "C"] for val in unique_values_page1) + + # Test pagination - second page + unique_values_page2, total_count = await document_store.get_field_unique_values_async("meta.category", None, 2, 2) + assert len(unique_values_page2) == 1 + assert total_count == 3 + assert unique_values_page2[0] in ["A", "B", "C"] + + # Test with search term - filter by content matching "Python" + unique_values_filtered, total_count = await document_store.get_field_unique_values_async("meta.category", "Python", 0, 10) + assert set(unique_values_filtered) == {"A"} # Only category A has documents with "Python" in content + assert total_count == 1 + + # Test with search term - filter by content matching "Java" + unique_values_java, total_count = await document_store.get_field_unique_values_async("meta.category", "Java", 0, 10) + assert set(unique_values_java) == {"B"} # Only category B has documents with "Java" in content + assert total_count == 1 + + # Test with integer values + int_docs = [ + Document(content="Doc 1", meta={"priority": 1}), + Document(content="Doc 2", meta={"priority": 2}), + Document(content="Doc 3", meta={"priority": 1}), + Document(content="Doc 4", meta={"priority": 3}), + ] + await document_store.write_documents_async(int_docs) + unique_priorities, priority_count = await document_store.get_field_unique_values_async("meta.priority", None, 0, 10) + assert set(unique_priorities) == {"1", "2", "3"} + assert priority_count == 3 + + # Test with search term on integer field + unique_priorities_filtered, priority_count = await document_store.get_field_unique_values_async("meta.priority", "Doc 1", 0, 10) + assert set(unique_priorities_filtered) == {"1"} + assert priority_count == 1 + \ No newline at end of file From 5e7cd906bfc6666c4c167b194c6f97b75ba355fd Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 5 Jan 2026 18:58:35 +0100 Subject: [PATCH 10/58] formmatting --- .../opensearch/document_store.py | 10 +++---- .../opensearch/tests/test_document_store.py | 8 +++--- .../tests/test_document_store_async.py | 26 +++++++++++++------ 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index cd902f3ac4..26eba98540 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -343,10 +343,10 @@ def _deserialize_search_hits(hits: list[dict[str, Any]]) -> list[Document]: out = [] # Fields that are not metadata (should stay at top level) non_meta_fields = {"id", "content", "embedding", "blob", "sparse_embedding", "score"} - + for hit in hits: data = hit["_source"].copy() - + # Reconstruct metadata dict from flattened fields meta = {} fields_to_remove = [] @@ -354,14 +354,14 @@ def _deserialize_search_hits(hits: list[dict[str, Any]]) -> list[Document]: if key not in non_meta_fields: meta[key] = value fields_to_remove.append(key) - + # Remove metadata fields from top level and add them to meta for key in fields_to_remove: data.pop(key, None) - + if meta: data["meta"] = meta - + if "highlight" in hit: if "meta" not in data: data["meta"] = {} diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 90d318dc71..517f6e8435 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -704,7 +704,7 @@ def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): min_max_single = document_store.get_field_min_max("meta.single_value") assert min_max_single["min"] == 42 assert min_max_single["max"] == 42 - + # Test with float values min_max_score = document_store.get_field_min_max("meta.rating") assert min_max_score["min"] == pytest.approx(5.2) @@ -767,6 +767,8 @@ def test_get_field_unique_values(self, document_store: OpenSearchDocumentStore): assert priority_count == 3 # Test with search term on integer field - unique_priorities_filtered, priority_count = document_store.get_field_unique_values("meta.priority", "Doc 1", 0, 10) + unique_priorities_filtered, priority_count = document_store.get_field_unique_values( + "meta.priority", "Doc 1", 0, 10 + ) assert set(unique_priorities_filtered) == {"1"} - assert priority_count == 1 \ No newline at end of file + assert priority_count == 1 diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index 28bdebe64a..b8d9242dec 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -523,24 +523,32 @@ async def test_get_field_unique_values(self, document_store: OpenSearchDocumentS assert lang_count == 3 # Test pagination - first page - unique_values_page1, total_count = await document_store.get_field_unique_values_async("meta.category", None, 0, 2) + unique_values_page1, total_count = await document_store.get_field_unique_values_async( + "meta.category", None, 0, 2 + ) assert len(unique_values_page1) == 2 assert total_count == 3 assert all(val in ["A", "B", "C"] for val in unique_values_page1) # Test pagination - second page - unique_values_page2, total_count = await document_store.get_field_unique_values_async("meta.category", None, 2, 2) + unique_values_page2, total_count = await document_store.get_field_unique_values_async( + "meta.category", None, 2, 2 + ) assert len(unique_values_page2) == 1 assert total_count == 3 assert unique_values_page2[0] in ["A", "B", "C"] # Test with search term - filter by content matching "Python" - unique_values_filtered, total_count = await document_store.get_field_unique_values_async("meta.category", "Python", 0, 10) + unique_values_filtered, total_count = await document_store.get_field_unique_values_async( + "meta.category", "Python", 0, 10 + ) assert set(unique_values_filtered) == {"A"} # Only category A has documents with "Python" in content assert total_count == 1 # Test with search term - filter by content matching "Java" - unique_values_java, total_count = await document_store.get_field_unique_values_async("meta.category", "Java", 0, 10) + unique_values_java, total_count = await document_store.get_field_unique_values_async( + "meta.category", "Java", 0, 10 + ) assert set(unique_values_java) == {"B"} # Only category B has documents with "Java" in content assert total_count == 1 @@ -552,13 +560,15 @@ async def test_get_field_unique_values(self, document_store: OpenSearchDocumentS Document(content="Doc 4", meta={"priority": 3}), ] await document_store.write_documents_async(int_docs) - unique_priorities, priority_count = await document_store.get_field_unique_values_async("meta.priority", None, 0, 10) + unique_priorities, priority_count = await document_store.get_field_unique_values_async( + "meta.priority", None, 0, 10 + ) assert set(unique_priorities) == {"1", "2", "3"} assert priority_count == 3 # Test with search term on integer field - unique_priorities_filtered, priority_count = await document_store.get_field_unique_values_async("meta.priority", "Doc 1", 0, 10) + unique_priorities_filtered, priority_count = await document_store.get_field_unique_values_async( + "meta.priority", "Doc 1", 0, 10 + ) assert set(unique_priorities_filtered) == {"1"} assert priority_count == 1 - - \ No newline at end of file From 0c0f31cb47b659bfbda58d651038599a790e77ff Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 5 Jan 2026 23:48:22 +0100 Subject: [PATCH 11/58] updating tests --- .../opensearch/document_store.py | 171 +++++++++++++++++- .../opensearch/tests/test_document_store.py | 62 +++++++ .../tests/test_document_store_async.py | 64 +++++++ 3 files changed, 292 insertions(+), 5 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 26eba98540..57a9085125 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -2,16 +2,19 @@ # # SPDX-License-Identifier: Apache-2.0 +import json from collections.abc import Mapping from math import exp -from typing import Any, Optional, Union +from typing import Any, Literal, Optional, Union +import requests from haystack import default_from_dict, default_to_dict, logging from haystack.dataclasses import Document from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack.utils.auth import Secret from opensearchpy import AsyncHttpConnection, AsyncOpenSearch, OpenSearch +from opensearchpy.exceptions import SerializationError from opensearchpy.helpers import async_bulk, bulk from haystack_integrations.document_stores.opensearch.auth import AsyncAWSAuth, AWSAuth @@ -21,6 +24,8 @@ Hosts = Union[str, list[Union[str, Mapping[str, Union[str, int]]]]] +ResponseFormat = Literal["json", "jdbc", "csv", "raw"] + # document scores are essentially unbounded and will be scaled to values between 0 and 1 if scale_score is set to # True. Scaling uses the expit function (inverse of the logit function) after applying a scaling factor # (e.g., BM25_SCALING_FACTOR for the bm25_retrieval method). @@ -1309,7 +1314,7 @@ def get_field_unique_values( field_name = self._normalize_metadata_field_name(metadata_field) # filter by search_term if provided - query = {"match_all": {}} + query: dict[str, Any] = {"match_all": {}} if search_term: # Use match_phrase for exact phrase matching to avoid tokenization issues query = {"match_phrase": {"content": search_term}} @@ -1370,7 +1375,7 @@ async def get_field_unique_values_async( field_name = self._normalize_metadata_field_name(metadata_field) # filter by search_term if provided - query = {"match_all": {}} + query: dict[str, Any] = {"match_all": {}} if search_term: # Use match_phrase for exact phrase matching to avoid tokenization issues query = {"match_phrase": {"content": search_term}} @@ -1413,5 +1418,161 @@ async def get_field_unique_values_async( return unique_values, total_count - def query_sql(self, query: str): - pass + def query_sql(self, query: str, response_format: ResponseFormat = "json") -> Any: + """ + Execute a raw OpenSearch SQL query against the index. + + :param query: The OpenSearch SQL query to execute + :param response_format: The format of the response. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ + :returns: The query results in the specified format. For JSON format, returns a list of dictionaries + (the _source from each hit). For other formats (csv, jdbc, raw), returns the response as text. + """ + self._ensure_initialized() + assert self._client is not None + + # For non-JSON formats, use requests directly to avoid deserialization issues + if response_format != "json": + try: + # Get connection info from the transport + connection = self._client.transport.get_connection() + base_url = connection.host + url = f"{base_url}/_plugins/_sql?format={response_format}" + + headers = {"Content-Type": "application/json"} + auth = None + if self._http_auth: + if isinstance(self._http_auth, tuple): + auth = self._http_auth + elif isinstance(self._http_auth, AWSAuth): + # For AWS auth, we need to use the opensearchpy client + # Fall through to the try/except below + pass + + verify = self._verify_certs if self._verify_certs is not None else True + timeout = self._timeout if self._timeout is not None else 30.0 + response = requests.post( + url, + json={"query": query}, + headers=headers, + auth=auth, + verify=verify, + timeout=timeout, + ) + response.raise_for_status() + return response.text + except Exception as e: + # If requests fails (e.g., AWS auth), fall back to opensearchpy + # which will raise SerializationError that we can handle + pass + + try: + body = {"query": query} + params = {"format": response_format} + + response_data = self._client.transport.perform_request( + method="POST", + url="/_plugins/_sql", + params=params, + body=body, + ) + + if response_format == "json": + # extract only the query results + if isinstance(response_data, dict) and "hits" in response_data: + hits = response_data.get("hits", {}).get("hits", []) + # extract _source from each hit, which contains the actual document data + return [hit.get("_source", {}) for hit in hits] + return response_data + else: + return response_data if isinstance(response_data, str) else str(response_data) + except SerializationError: + # If we get here, it means requests failed above (likely AWS auth) + # and opensearchpy can't deserialize the response + # Re-raise as DocumentStoreError with a helpful message + msg = f"Failed to execute SQL query in OpenSearch: Unable to deserialize {response_format} response. This format may not be supported with the current authentication method." + raise DocumentStoreError(msg) from None + except Exception as e: + msg = f"Failed to execute SQL query in OpenSearch: {e!s}" + raise DocumentStoreError(msg) from e + + async def query_sql_async(self, query: str, response_format: ResponseFormat = "json") -> Any: + """ + Asynchronously execute a raw OpenSearch SQL query against the index. + + :param query: The OpenSearch SQL query to execute + :param response_format: The format of the response. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ + :returns: The query results in the specified format. For JSON format, returns a list of dictionaries + (the _source from each hit). For other formats (csv, jdbc, raw), returns the response as text. + """ + await self._ensure_initialized_async() + assert self._async_client is not None + + # For non-JSON formats, use httpx directly to avoid deserialization issues + if response_format != "json": + try: + import httpx + + # Get connection info from the transport + connection = self._async_client.transport.get_connection() + base_url = connection.host + url = f"{base_url}/_plugins/_sql?format={response_format}" + + headers = {"Content-Type": "application/json"} + auth = None + if self._http_auth: + if isinstance(self._http_auth, tuple): + auth = self._http_auth + elif isinstance(self._http_auth, AWSAuth): + # For AWS auth, we need to use the opensearchpy client + # Fall through to the try/except below + pass + + verify = self._verify_certs if self._verify_certs is not None else True + timeout = httpx.Timeout(self._timeout if self._timeout else 30.0) + + async with httpx.AsyncClient(verify=verify, timeout=timeout) as client: + response = await client.post( + url, + json={"query": query}, + headers=headers, + auth=auth, + ) + response.raise_for_status() + return response.text + except ImportError: + # httpx not available, fall through to opensearchpy + pass + except Exception as e: + # If httpx fails (e.g., AWS auth), fall back to opensearchpy + # which will raise SerializationError that we can handle + pass + + try: + body = {"query": query} + params = {"format": response_format} + + response_data = await self._async_client.transport.perform_request( + method="POST", + url="/_plugins/_sql", + params=params, + body=body, + ) + + if response_format == "json": + # extract only the query results + if isinstance(response_data, dict) and "hits" in response_data: + hits = response_data.get("hits", {}).get("hits", []) + # extract _source from each hit, which contains the actual document data + return [hit.get("_source", {}) for hit in hits] + return response_data + else: + return response_data if isinstance(response_data, str) else str(response_data) + except SerializationError: + # If we get here, it means httpx failed above (likely AWS auth or not installed) + # and opensearchpy can't deserialize the response + # Re-raise as DocumentStoreError with a helpful message + msg = f"Failed to execute SQL query in OpenSearch: Unable to deserialize {response_format} response. This format may not be supported with the current authentication method. Consider installing httpx for better support." + raise DocumentStoreError(msg) from None + except Exception as e: + msg = f"Failed to execute SQL query in OpenSearch: {e!s}" + raise DocumentStoreError(msg) from e diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 517f6e8435..e6cd334290 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -772,3 +772,65 @@ def test_get_field_unique_values(self, document_store: OpenSearchDocumentStore): ) assert set(unique_priorities_filtered) == {"1"} assert priority_count == 1 + + def test_query_sql(self, document_store: OpenSearchDocumentStore): + """ + Test executing SQL queries against the OpenSearch index. + """ + docs = [ + Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}), + Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}), + Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}), + Document(content="JavaScript development", meta={"category": "C", "status": "active", "priority": 1}), + ] + document_store.write_documents(docs) + time.sleep(1) # Wait for documents to be indexed + + # Test SQL query with JSON format (default) + sql_query = ( + f"SELECT content, category, status, priority FROM {document_store._index} " # noqa: S608 + f"WHERE category = 'A' ORDER BY priority" + ) + result = document_store.query_sql(sql_query, response_format="json") + + # New format returns a list of dictionaries (the _source from each hit) + assert len(result) == 2 # Two documents with category A + assert isinstance(result, list) + assert all(isinstance(row, dict) for row in result) + + # Verify data contains expected values + categories = [row.get("category") for row in result] + assert all(cat == "A" for cat in categories) + + # Verify all expected fields are present + for row in result: + assert "content" in row + assert "category" in row + assert "status" in row + assert "priority" in row + + # Test SQL query with CSV format + result_csv = document_store.query_sql(sql_query, response_format="csv") + assert isinstance(result_csv, str) + assert "content" in result_csv + assert "category" in result_csv + + # Test SQL query with JDBC format + result_jdbc = document_store.query_sql(sql_query, response_format="jdbc") + # JDBC format can be dict or str depending on OpenSearch version + assert result_jdbc is not None + + # Test SQL query with RAW format + result_raw = document_store.query_sql(sql_query, response_format="raw") + assert isinstance(result_raw, str) + + # Test COUNT query + count_query = f"SELECT COUNT(*) as total FROM {document_store._index}" # noqa: S608 + count_result = document_store.query_sql(count_query, response_format="json") + # COUNT query may return different format, check it's a valid response + assert count_result is not None + + # Test error handling for invalid SQL query + invalid_query = "SELECT * FROM non_existent_index" + with pytest.raises(DocumentStoreError, match="Failed to execute SQL query"): + document_store.query_sql(invalid_query) diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index b8d9242dec..df01566f54 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -6,6 +6,7 @@ import pytest from haystack.dataclasses import Document +from haystack.document_stores.errors import DocumentStoreError from haystack.document_stores.types import DuplicatePolicy from haystack_integrations.document_stores.opensearch.document_store import OpenSearchDocumentStore @@ -572,3 +573,66 @@ async def test_get_field_unique_values(self, document_store: OpenSearchDocumentS ) assert set(unique_priorities_filtered) == {"1"} assert priority_count == 1 + + @pytest.mark.asyncio + async def test_query_sql(self, document_store: OpenSearchDocumentStore): + """ + Test executing SQL queries against the OpenSearch index. + """ + docs = [ + Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}), + Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}), + Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}), + Document(content="JavaScript development", meta={"category": "C", "status": "active", "priority": 1}), + ] + await document_store.write_documents_async(docs) + time.sleep(1) # Wait for documents to be indexed + + # Test SQL query with JSON format (default) + sql_query = ( + f"SELECT content, category, status, priority FROM {document_store._index} " # noqa: S608 + f"WHERE category = 'A' ORDER BY priority" + ) + result = await document_store.query_sql_async(sql_query, response_format="json") + + # New format returns a list of dictionaries (the _source from each hit) + assert len(result) == 2 # Two documents with category A + assert isinstance(result, list) + assert all(isinstance(row, dict) for row in result) + + # Verify data contains expected values + categories = [row.get("category") for row in result] + assert all(cat == "A" for cat in categories) + + # Verify all expected fields are present + for row in result: + assert "content" in row + assert "category" in row + assert "status" in row + assert "priority" in row + + # Test SQL query with CSV format + result_csv = await document_store.query_sql_async(sql_query, response_format="csv") + assert isinstance(result_csv, str) + assert "content" in result_csv + assert "category" in result_csv + + # Test SQL query with JDBC format + result_jdbc = await document_store.query_sql_async(sql_query, response_format="jdbc") + # JDBC format can be dict or str depending on OpenSearch version + assert result_jdbc is not None + + # Test SQL query with RAW format + result_raw = await document_store.query_sql_async(sql_query, response_format="raw") + assert isinstance(result_raw, str) + + # Test COUNT query + count_query = f"SELECT COUNT(*) as total FROM {document_store._index}" # noqa: S608 + count_result = await document_store.query_sql_async(count_query, response_format="json") + # COUNT query may return different format, check it's a valid response + assert count_result is not None + + # Test error handling for invalid SQL query + invalid_query = "SELECT * FROM non_existent_index" + with pytest.raises(DocumentStoreError, match="Failed to execute SQL query"): + await document_store.query_sql_async(invalid_query) From 2010261eb38e4baa75af9e94d966d51e45254654 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 6 Jan 2026 00:03:24 +0100 Subject: [PATCH 12/58] formmatting --- .../opensearch/document_store.py | 108 +++++++++--------- 1 file changed, 55 insertions(+), 53 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 57a9085125..c172047bbf 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -2,11 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 -import json from collections.abc import Mapping from math import exp from typing import Any, Literal, Optional, Union +import httpx import requests from haystack import default_from_dict, default_to_dict, logging from haystack.dataclasses import Document @@ -1418,6 +1418,39 @@ async def get_field_unique_values_async( return unique_values, total_count + def _prepare_sql_http_request_params( + self, base_url: str, response_format: ResponseFormat + ) -> tuple[str, dict[str, str], Any]: + """ + Prepares HTTP request parameters for SQL query execution. + """ + url = f"{base_url}/_plugins/_sql?format={response_format}" + headers = {"Content-Type": "application/json"} + auth = None + if self._http_auth: + if isinstance(self._http_auth, tuple): + auth = self._http_auth + elif isinstance(self._http_auth, AWSAuth): + # For AWS auth, we need to use the opensearchpy client + # Fall through to the try/except below + pass + return url, headers, auth + + @staticmethod + def _process_sql_response(response_data: Any, response_format: ResponseFormat) -> Any: + """ + Processes the SQL query response data. + """ + if response_format == "json": + # extract only the query results + if isinstance(response_data, dict) and "hits" in response_data: + hits = response_data.get("hits", {}).get("hits", []) + # extract _source from each hit, which contains the actual document data + return [hit.get("_source", {}) for hit in hits] + return response_data + else: + return response_data if isinstance(response_data, str) else str(response_data) + def query_sql(self, query: str, response_format: ResponseFormat = "json") -> Any: """ Execute a raw OpenSearch SQL query against the index. @@ -1436,18 +1469,8 @@ def query_sql(self, query: str, response_format: ResponseFormat = "json") -> Any # Get connection info from the transport connection = self._client.transport.get_connection() base_url = connection.host - url = f"{base_url}/_plugins/_sql?format={response_format}" - - headers = {"Content-Type": "application/json"} - auth = None - if self._http_auth: - if isinstance(self._http_auth, tuple): - auth = self._http_auth - elif isinstance(self._http_auth, AWSAuth): - # For AWS auth, we need to use the opensearchpy client - # Fall through to the try/except below - pass - + url, headers, auth = self._prepare_sql_http_request_params(base_url, response_format) + verify = self._verify_certs if self._verify_certs is not None else True timeout = self._timeout if self._timeout is not None else 30.0 response = requests.post( @@ -1463,12 +1486,12 @@ def query_sql(self, query: str, response_format: ResponseFormat = "json") -> Any except Exception as e: # If requests fails (e.g., AWS auth), fall back to opensearchpy # which will raise SerializationError that we can handle - pass - + logger.error(f"Failed to execute SQL query in OpenSearch: {e!s}") + try: body = {"query": query} params = {"format": response_format} - + response_data = self._client.transport.perform_request( method="POST", url="/_plugins/_sql", @@ -1476,20 +1499,15 @@ def query_sql(self, query: str, response_format: ResponseFormat = "json") -> Any body=body, ) - if response_format == "json": - # extract only the query results - if isinstance(response_data, dict) and "hits" in response_data: - hits = response_data.get("hits", {}).get("hits", []) - # extract _source from each hit, which contains the actual document data - return [hit.get("_source", {}) for hit in hits] - return response_data - else: - return response_data if isinstance(response_data, str) else str(response_data) + return self._process_sql_response(response_data, response_format) except SerializationError: # If we get here, it means requests failed above (likely AWS auth) # and opensearchpy can't deserialize the response # Re-raise as DocumentStoreError with a helpful message - msg = f"Failed to execute SQL query in OpenSearch: Unable to deserialize {response_format} response. This format may not be supported with the current authentication method." + msg = ( + f"Failed to execute SQL query in OpenSearch: Unable to deserialize {response_format} response. " + f"This format may not be supported with the current authentication method." + ) raise DocumentStoreError(msg) from None except Exception as e: msg = f"Failed to execute SQL query in OpenSearch: {e!s}" @@ -1510,26 +1528,14 @@ async def query_sql_async(self, query: str, response_format: ResponseFormat = "j # For non-JSON formats, use httpx directly to avoid deserialization issues if response_format != "json": try: - import httpx - # Get connection info from the transport connection = self._async_client.transport.get_connection() base_url = connection.host - url = f"{base_url}/_plugins/_sql?format={response_format}" - - headers = {"Content-Type": "application/json"} - auth = None - if self._http_auth: - if isinstance(self._http_auth, tuple): - auth = self._http_auth - elif isinstance(self._http_auth, AWSAuth): - # For AWS auth, we need to use the opensearchpy client - # Fall through to the try/except below - pass - + url, headers, auth = self._prepare_sql_http_request_params(base_url, response_format) + verify = self._verify_certs if self._verify_certs is not None else True timeout = httpx.Timeout(self._timeout if self._timeout else 30.0) - + async with httpx.AsyncClient(verify=verify, timeout=timeout) as client: response = await client.post( url, @@ -1545,12 +1551,12 @@ async def query_sql_async(self, query: str, response_format: ResponseFormat = "j except Exception as e: # If httpx fails (e.g., AWS auth), fall back to opensearchpy # which will raise SerializationError that we can handle - pass + logger.error(f"Failed to execute SQL query in OpenSearch: {e!s}") try: body = {"query": query} params = {"format": response_format} - + response_data = await self._async_client.transport.perform_request( method="POST", url="/_plugins/_sql", @@ -1558,20 +1564,16 @@ async def query_sql_async(self, query: str, response_format: ResponseFormat = "j body=body, ) - if response_format == "json": - # extract only the query results - if isinstance(response_data, dict) and "hits" in response_data: - hits = response_data.get("hits", {}).get("hits", []) - # extract _source from each hit, which contains the actual document data - return [hit.get("_source", {}) for hit in hits] - return response_data - else: - return response_data if isinstance(response_data, str) else str(response_data) + return self._process_sql_response(response_data, response_format) except SerializationError: # If we get here, it means httpx failed above (likely AWS auth or not installed) # and opensearchpy can't deserialize the response # Re-raise as DocumentStoreError with a helpful message - msg = f"Failed to execute SQL query in OpenSearch: Unable to deserialize {response_format} response. This format may not be supported with the current authentication method. Consider installing httpx for better support." + msg = ( + f"Failed to execute SQL query in OpenSearch: Unable to deserialize {response_format} response. " + f"This format may not be supported with the current authentication method. " + f"Consider installing httpx for better support." + ) raise DocumentStoreError(msg) from None except Exception as e: msg = f"Failed to execute SQL query in OpenSearch: {e!s}" From 873a4dc50061c9cd48e0612920ef7aa978e9a30a Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 6 Jan 2026 10:55:34 +0100 Subject: [PATCH 13/58] cleaning up --- .../opensearch/document_store.py | 32 ++++++------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index c172047bbf..e80193f2fc 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1093,9 +1093,6 @@ async def count_documents_by_filter_async(self, filters: dict) -> int: def _build_cardinality_aggregations(index_mapping: dict[str, Any]) -> dict[str, Any]: """ Builds cardinality aggregations for all metadata fields in the index mapping. - - :param index_mapping: The properties mapping from the index. - :returns: Dictionary of aggregations keyed by field name. """ special_fields = {"content", "embedding", "id", "score", "blob", "sparse_embedding"} aggs = {} @@ -1108,10 +1105,6 @@ def _build_cardinality_aggregations(index_mapping: dict[str, Any]) -> dict[str, def _build_distinct_values_query_body(filters: dict, aggs: dict[str, Any]) -> dict[str, Any]: """ Builds the query body for distinct values counting with filters and aggregations. - - :param filters: The filters to apply, or empty dict for no filters. - :param aggs: The aggregations to include in the query. - :returns: The query body dictionary. """ if filters: normalized_filters = normalize_filters(filters) @@ -1134,10 +1127,6 @@ def _extract_distinct_counts_from_aggregations( ) -> dict[str, int]: """ Extracts distinct value counts from search result aggregations. - - :param aggregations: The aggregations from the search result. - :param index_mapping: The properties mapping from the index. - :returns: Dictionary mapping field names to their distinct value counts. """ special_fields = {"content", "embedding", "id", "score", "blob", "sparse_embedding"} distinct_counts = {} @@ -1459,6 +1448,9 @@ def query_sql(self, query: str, response_format: ResponseFormat = "json") -> Any :param response_format: The format of the response. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ :returns: The query results in the specified format. For JSON format, returns a list of dictionaries (the _source from each hit). For other formats (csv, jdbc, raw), returns the response as text. + + NOTE: For non-JSON formats (csv, jdbc, raw), use requests to make a raw HTTP request and get the text response + This avoids deserialization issues with the opensearchpy client. """ self._ensure_initialized() assert self._client is not None @@ -1501,9 +1493,8 @@ def query_sql(self, query: str, response_format: ResponseFormat = "json") -> Any return self._process_sql_response(response_data, response_format) except SerializationError: - # If we get here, it means requests failed above (likely AWS auth) - # and opensearchpy can't deserialize the response - # Re-raise as DocumentStoreError with a helpful message + # If we get here, it means requests failed above (likely AWS auth) and opensearchpy can't deserialize the + # response. Re-raise as DocumentStoreError with a helpful message msg = ( f"Failed to execute SQL query in OpenSearch: Unable to deserialize {response_format} response. " f"This format may not be supported with the current authentication method." @@ -1521,6 +1512,9 @@ async def query_sql_async(self, query: str, response_format: ResponseFormat = "j :param response_format: The format of the response. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ :returns: The query results in the specified format. For JSON format, returns a list of dictionaries (the _source from each hit). For other formats (csv, jdbc, raw), returns the response as text. + + NOTE: For non-JSON formats (csv, jdbc, raw), use httpx AsyncClient to make a raw HTTP request and get the text + response. This avoids deserialization issues with the opensearchpy client. """ await self._ensure_initialized_async() assert self._async_client is not None @@ -1545,12 +1539,7 @@ async def query_sql_async(self, query: str, response_format: ResponseFormat = "j ) response.raise_for_status() return response.text - except ImportError: - # httpx not available, fall through to opensearchpy - pass except Exception as e: - # If httpx fails (e.g., AWS auth), fall back to opensearchpy - # which will raise SerializationError that we can handle logger.error(f"Failed to execute SQL query in OpenSearch: {e!s}") try: @@ -1566,9 +1555,8 @@ async def query_sql_async(self, query: str, response_format: ResponseFormat = "j return self._process_sql_response(response_data, response_format) except SerializationError: - # If we get here, it means httpx failed above (likely AWS auth or not installed) - # and opensearchpy can't deserialize the response - # Re-raise as DocumentStoreError with a helpful message + # If we get here, it means httpx failed above (likely AWS auth or not installed) and opensearchpy can't + # deserialize the response. Re-raise as DocumentStoreError with a helpful message msg = ( f"Failed to execute SQL query in OpenSearch: Unable to deserialize {response_format} response. " f"This format may not be supported with the current authentication method. " From 1f3347bcb0a489a33aaa45b43dabb1e9017786b0 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 6 Jan 2026 10:56:47 +0100 Subject: [PATCH 14/58] adding httpx as a dependency --- integrations/opensearch/pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/opensearch/pyproject.toml b/integrations/opensearch/pyproject.toml index 466863c445..e3f93af0b7 100644 --- a/integrations/opensearch/pyproject.toml +++ b/integrations/opensearch/pyproject.toml @@ -25,7 +25,8 @@ classifiers = [ ] dependencies = [ "haystack-ai>=2.14.0", - "opensearch-py[async]>=2.4.0,<3"] + "opensearch-py[async]>=2.4.0,<3"], + "httpx>=0.28.1" [project.urls] Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/opensearch#readme" From 3622168b28ba40dbbe526b35b8491b8eb5e34f57 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 6 Jan 2026 11:03:45 +0100 Subject: [PATCH 15/58] fixing pyproject.toml --- integrations/opensearch/pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/opensearch/pyproject.toml b/integrations/opensearch/pyproject.toml index e3f93af0b7..db8bcfd19e 100644 --- a/integrations/opensearch/pyproject.toml +++ b/integrations/opensearch/pyproject.toml @@ -25,8 +25,9 @@ classifiers = [ ] dependencies = [ "haystack-ai>=2.14.0", - "opensearch-py[async]>=2.4.0,<3"], + "opensearch-py[async]>=2.4.0,<3", "httpx>=0.28.1" +] [project.urls] Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/opensearch#readme" From d96cc4cf85c57b02e8b7535c62c528258a63e683 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 7 Jan 2026 16:32:39 +0100 Subject: [PATCH 16/58] updating tests: making use of the new refresh feature --- integrations/opensearch/tests/test_document_store.py | 3 +-- integrations/opensearch/tests/test_document_store_async.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 3c25636351..4fa6bca5cb 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -781,8 +781,7 @@ def test_query_sql(self, document_store: OpenSearchDocumentStore): Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}), Document(content="JavaScript development", meta={"category": "C", "status": "active", "priority": 1}), ] - document_store.write_documents(docs) - time.sleep(1) # Wait for documents to be indexed + document_store.write_documents(docs, refresh=True) # Test SQL query with JSON format (default) sql_query = ( diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index be17577d34..fd37a6dcd2 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -581,8 +581,7 @@ async def test_query_sql(self, document_store: OpenSearchDocumentStore): Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}), Document(content="JavaScript development", meta={"category": "C", "status": "active", "priority": 1}), ] - await document_store.write_documents_async(docs) - time.sleep(1) # Wait for documents to be indexed + await document_store.write_documents_async(docs, refresh=True) # Test SQL query with JSON format (default) sql_query = ( From 69863d0a539aba8f3d8a634b3e58c187d554fc23 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 8 Jan 2026 10:55:08 +0100 Subject: [PATCH 17/58] dealing with special fields --- .../document_stores/opensearch/document_store.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 3ca8e9d813..82a4c6331a 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -24,6 +24,7 @@ logger = logging.getLogger(__name__) +SPECIAL_FIELDS = {"content", "embedding", "id", "score", "sparse_embedding", "blob"} Hosts = Union[str, list[Union[str, Mapping[str, Union[str, int]]]]] @@ -351,8 +352,6 @@ async def count_documents_async(self) -> int: @staticmethod def _deserialize_search_hits(hits: list[dict[str, Any]]) -> list[Document]: out = [] - # Fields that are not metadata (should stay at top level) - non_meta_fields = {"id", "content", "embedding", "blob", "sparse_embedding", "score"} for hit in hits: data = hit["_source"].copy() @@ -361,7 +360,7 @@ def _deserialize_search_hits(hits: list[dict[str, Any]]) -> list[Document]: meta = {} fields_to_remove = [] for key, value in data.items(): - if key not in non_meta_fields: + if key not in SPECIAL_FIELDS: meta[key] = value fields_to_remove.append(key) @@ -1203,10 +1202,9 @@ def _build_cardinality_aggregations(index_mapping: dict[str, Any]) -> dict[str, """ Builds cardinality aggregations for all metadata fields in the index mapping. """ - special_fields = {"content", "embedding", "id", "score", "blob", "sparse_embedding"} aggs = {} for field_name in index_mapping.keys(): - if field_name not in special_fields: + if field_name not in SPECIAL_FIELDS: aggs[f"{field_name}_cardinality"] = {"cardinality": {"field": field_name}} return aggs @@ -1237,10 +1235,9 @@ def _extract_distinct_counts_from_aggregations( """ Extracts distinct value counts from search result aggregations. """ - special_fields = {"content", "embedding", "id", "score", "blob", "sparse_embedding"} distinct_counts = {} for field_name in index_mapping.keys(): - if field_name not in special_fields: + if field_name not in SPECIAL_FIELDS: agg_key = f"{field_name}_cardinality" if agg_key in aggregations: distinct_counts[field_name] = aggregations[agg_key]["value"] From 3a3df4c025ac9e4d7a5057857eff5cf4d7417124 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 8 Jan 2026 10:56:04 +0100 Subject: [PATCH 18/58] docstring update --- .../document_stores/opensearch/document_store.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 82a4c6331a..876c6cfa95 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1201,6 +1201,8 @@ async def count_documents_by_filter_async(self, filters: dict) -> int: def _build_cardinality_aggregations(index_mapping: dict[str, Any]) -> dict[str, Any]: """ Builds cardinality aggregations for all metadata fields in the index mapping. + + See: https://docs.opensearch.org/latest/aggregations/metric/cardinality/ """ aggs = {} for field_name in index_mapping.keys(): From 6b2081b9ec27387e49ebf36ce08994e338bb2073 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 8 Jan 2026 15:00:25 +0100 Subject: [PATCH 19/58] adding roundtrip tests to assert documents metadata is correctly written and retrieved --- .../opensearch/document_store.py | 23 +------- .../opensearch/tests/test_bm25_retriever.py | 54 +++++++++++++++++++ .../opensearch/tests/test_document_store.py | 2 - .../tests/test_embedding_retriever.py | 48 +++++++++++++++++ 4 files changed, 104 insertions(+), 23 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 876c6cfa95..45cf600b18 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -352,29 +352,10 @@ async def count_documents_async(self) -> int: @staticmethod def _deserialize_search_hits(hits: list[dict[str, Any]]) -> list[Document]: out = [] - for hit in hits: - data = hit["_source"].copy() - - # Reconstruct metadata dict from flattened fields - meta = {} - fields_to_remove = [] - for key, value in data.items(): - if key not in SPECIAL_FIELDS: - meta[key] = value - fields_to_remove.append(key) - - # Remove metadata fields from top level and add them to meta - for key in fields_to_remove: - data.pop(key, None) - - if meta: - data["meta"] = meta - + data = hit["_source"] if "highlight" in hit: - if "meta" not in data: - data["meta"] = {} - data["meta"]["highlighted"] = hit["highlight"] + data["metadata"]["highlighted"] = hit["highlight"] data["score"] = hit["_score"] out.append(Document.from_dict(data)) diff --git a/integrations/opensearch/tests/test_bm25_retriever.py b/integrations/opensearch/tests/test_bm25_retriever.py index 03235c36b3..d6bb6350e4 100644 --- a/integrations/opensearch/tests/test_bm25_retriever.py +++ b/integrations/opensearch/tests/test_bm25_retriever.py @@ -424,6 +424,60 @@ def test_bm25_retriever_runtime_document_store_switching( assert len(results_1_again["documents"]) == 1 +@pytest.mark.integration +def test_bm25_retriever_document_structure_with_metadata(document_store): + """ + Test document structure with complex metadata (nested values, lists, etc.) + """ + docs = [ + Document( + content="Python is versatile", + meta={ + "category": "programming", + "tags": ["python", "general-purpose"], + "rating": 4.5, + "active": True, + "author": {"name": "John", "role": "developer"}, + }, + id="python_doc", + ), + Document( + content="JavaScript is dynamic", + meta={ + "category": "programming", + "tags": ["javascript", "web"], + "rating": 4.8, + "active": True, + }, + id="js_doc", + ), + ] + document_store.write_documents(docs, refresh=True) + retriever = OpenSearchBM25Retriever(document_store=document_store) + + results = retriever.run(query="programming", top_k=2) + assert len(results["documents"]) == 2 + + for doc in results["documents"]: + # Verify structure + assert hasattr(doc, "content") + assert hasattr(doc, "meta") + assert isinstance(doc.meta, dict) + + # Verify complex metadata is preserved + assert "category" in doc.meta + assert "tags" in doc.meta + assert isinstance(doc.meta["tags"], list) + assert "rating" in doc.meta + + # Verify document can be serialized/deserialized + doc_dict = doc.to_dict() + doc_from_dict = Document.from_dict(doc_dict) + assert doc_from_dict.content == doc.content + assert doc_from_dict.meta == doc.meta + assert doc_from_dict.id == doc.id + + @pytest.mark.asyncio @pytest.mark.integration async def test_bm25_retriever_async_runtime_document_store_switching( diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 91500edff9..6fb562608c 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -615,7 +615,6 @@ def test_update_by_filter(self, document_store: OpenSearchDocumentStore): assert len(draft_docs) == 1 assert draft_docs[0].meta["category"] == "B" - def test_count_documents_by_filter(self, document_store: OpenSearchDocumentStore): docs = [ Document(content="Doc 1", meta={"category": "A", "status": "active"}), @@ -914,4 +913,3 @@ def test_delete_with_routing(self, document_store: OpenSearchDocumentStore): document_store.delete_documents(["1", "2"], routing=routing_map) assert document_store.count_documents() == 1 - diff --git a/integrations/opensearch/tests/test_embedding_retriever.py b/integrations/opensearch/tests/test_embedding_retriever.py index a01b7dc008..fe2f5865c6 100644 --- a/integrations/opensearch/tests/test_embedding_retriever.py +++ b/integrations/opensearch/tests/test_embedding_retriever.py @@ -404,3 +404,51 @@ async def test_embedding_retriever_runtime_document_store_switching_async( python_query_embedding = [0.4, 0.4, 0.4] + [0.0] * 765 results_1_again = await retriever.run_async(query_embedding=python_query_embedding) assert "Python" in results_1_again["documents"][0].content + + +@pytest.mark.integration +def test_embedding_retriever_document_structure_with_metadata(document_store, test_documents_with_embeddings_1): + """ + Test that documents returned by embedding retriever have correct structure: + - Metadata fields are in doc.meta (not at top level) + - Special fields (content, embedding, id, score) are at top level + - All original metadata is preserved + """ + document_store.write_documents(test_documents_with_embeddings_1, refresh=True) + retriever = OpenSearchEmbeddingRetriever(document_store=document_store) + + # Query embedding to match functional programming languages + query_embedding = [0.2, 0.3, 0.4] + [0.0] * 765 + results = retriever.run(query_embedding=query_embedding, top_k=5) + + assert len(results["documents"]) > 0 + + for doc in results["documents"]: + # Verify special fields are at top level + assert hasattr(doc, "content") + assert isinstance(doc.content, str) + assert hasattr(doc, "id") + assert isinstance(doc.id, str) + assert hasattr(doc, "score") + assert doc.score is not None + assert hasattr(doc, "embedding") + assert isinstance(doc.embedding, list) + assert len(doc.embedding) == 768 + + # Verify metadata fields are in meta dict (not at top level) + assert hasattr(doc, "meta") + assert isinstance(doc.meta, dict) + + # Verify original metadata is preserved + assert "likes" in doc.meta + assert "language_type" in doc.meta + assert isinstance(doc.meta["likes"], int) + assert isinstance(doc.meta["language_type"], str) + + # Verify document can be serialized/deserialized + doc_dict = doc.to_dict() + doc_from_dict = Document.from_dict(doc_dict) + assert doc_from_dict.content == doc.content + assert doc_from_dict.meta == doc.meta + assert doc_from_dict.id == doc.id + assert doc_from_dict.embedding == doc.embedding From 923081e86677f40a6552f17636a18d7b41f3fed1 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 10:20:45 +0100 Subject: [PATCH 20/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 45cf600b18..ff1e2118c0 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1148,7 +1148,7 @@ def _render_custom_query(self, custom_query: Any, substitutions: dict[str, Any]) return custom_query - def count_documents_by_filter(self, filters: dict) -> int: + def count_documents_by_filter(self, filters: dict[str, Any]) -> int: """ Returns the number of documents that match the provided filters. From abd4b7f3c5b1a5d0a3e24e9bece44e64c4bfa505 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 10:20:55 +0100 Subject: [PATCH 21/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index ff1e2118c0..1ab9b42e83 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1163,7 +1163,7 @@ def count_documents_by_filter(self, filters: dict[str, Any]) -> int: body = {"query": {"bool": {"filter": normalized_filters}}} return self._client.count(index=self._index, body=body)["count"] - async def count_documents_by_filter_async(self, filters: dict) -> int: + async def count_documents_by_filter_async(self, filters: dict[str, Any]) -> int: """ Asynchronously returns the number of documents that match the provided filters. From dfcb8ec87fa296620075f963f575339e89efab21 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 10:31:43 +0100 Subject: [PATCH 22/58] updating function names --- .../document_stores/opensearch/document_store.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 1ab9b42e83..e1532b10be 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1281,7 +1281,7 @@ async def count_distinct_values_by_filter_async(self, filters: dict) -> dict[str # extract cardinality values from aggregations return self._extract_distinct_counts_from_aggregations(result.get("aggregations", {}), index_mapping) - def get_fields_info(self) -> dict[str, dict]: + def get_meta_fields_info(self) -> dict[str, dict]: """ Returns the information about the fields in the index. @@ -1340,7 +1340,7 @@ def _extract_min_max_from_stats(stats: dict[str, Any]) -> dict[str, Any]: max_value = stats.get("max") return {"min": min_value, "max": max_value} - def get_field_min_max(self, metadata_field: str) -> dict[str, Any]: + def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]: """ Returns the minimum and maximum values for the given metadata field. @@ -1374,7 +1374,7 @@ async def get_field_min_max_async(self, metadata_field: str) -> dict[str, Any]: return self._extract_min_max_from_stats(stats) - def get_field_unique_values( + def get_metadata_field_unique_values( self, metadata_field: str, search_term: str | None, from_: int, size: int ) -> tuple[list[str], int]: """ From a926d03f2c99c006b657a32841a57bca9fa25af6 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 11:36:58 +0100 Subject: [PATCH 23/58] updating function names + tests --- .../opensearch/document_store.py | 12 ++--- .../opensearch/tests/test_document_store.py | 48 +++++++++++-------- .../tests/test_document_store_async.py | 45 +++++++++-------- 3 files changed, 58 insertions(+), 47 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index e1532b10be..872953c1b5 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1226,7 +1226,7 @@ def _extract_distinct_counts_from_aggregations( distinct_counts[field_name] = aggregations[agg_key]["value"] return distinct_counts - def count_distinct_values_by_filter(self, filters: dict) -> dict[str, int]: + def count_distinct_metadata_values_by_filter(self, filters: dict) -> dict[str, int]: """ Returns the number of unique values for each meta field of the documents that match the provided filters. @@ -1253,7 +1253,7 @@ def count_distinct_values_by_filter(self, filters: dict) -> dict[str, int]: # extract cardinality values from aggregations return self._extract_distinct_counts_from_aggregations(result.get("aggregations", {}), index_mapping) - async def count_distinct_values_by_filter_async(self, filters: dict) -> dict[str, int]: + async def count_distinct_metadata_values_by_filter_async(self, filters: dict) -> dict[str, int]: """ Asynchronously returns the number of unique values for each meta field of the documents that match the provided filters. @@ -1281,7 +1281,7 @@ async def count_distinct_values_by_filter_async(self, filters: dict) -> dict[str # extract cardinality values from aggregations return self._extract_distinct_counts_from_aggregations(result.get("aggregations", {}), index_mapping) - def get_meta_fields_info(self) -> dict[str, dict]: + def get_metadata_fields_info(self) -> dict[str, dict]: """ Returns the information about the fields in the index. @@ -1294,7 +1294,7 @@ def get_meta_fields_info(self) -> dict[str, dict]: index_mapping = mapping[self._index]["mappings"]["properties"] return index_mapping - async def get_fields_info_async(self) -> dict[str, dict]: + async def get_metadata_fields_info_async(self) -> dict[str, dict]: """ Asynchronously returns the information about the fields in the index. @@ -1357,7 +1357,7 @@ def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]: return self._extract_min_max_from_stats(stats) - async def get_field_min_max_async(self, metadata_field: str) -> dict[str, Any]: + async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[str, Any]: """ Asynchronously returns the minimum and maximum values for the given metadata field. @@ -1435,7 +1435,7 @@ def get_metadata_field_unique_values( return unique_values, total_count - async def get_field_unique_values_async( + async def get_metadata_field_unique_values_async( self, metadata_field: str, search_term: str | None, from_: int, size: int ) -> tuple[list[str], int]: """ diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 3e224c4982..70a9696318 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -619,7 +619,7 @@ def test_count_documents_by_filter(self, document_store: OpenSearchDocumentStore ) assert count_a_active == 2 - def test_count_distinct_values_by_filter(self, document_store: OpenSearchDocumentStore): + def test_count_distinct_metadata_values_by_filter(self, document_store: OpenSearchDocumentStore): docs = [ Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), Document(content="Doc 2", meta={"category": "B", "status": "active", "priority": 2}), @@ -631,13 +631,13 @@ def test_count_distinct_values_by_filter(self, document_store: OpenSearchDocumen assert document_store.count_documents() == 5 # Count distinct values for all documents - distinct_counts = document_store.count_distinct_values_by_filter(filters={}) + distinct_counts = document_store.count_distinct_metadata_values_by_filter(filters={}) assert distinct_counts["category"] == 3 # A, B, C assert distinct_counts["status"] == 2 # active, inactive assert distinct_counts["priority"] == 3 # 1, 2, 3 # Count distinct values for documents with category="A" - distinct_counts_a = document_store.count_distinct_values_by_filter( + distinct_counts_a = document_store.count_distinct_metadata_values_by_filter( filters={"field": "meta.category", "operator": "==", "value": "A"} ) assert distinct_counts_a["category"] == 1 # Only A @@ -645,7 +645,7 @@ def test_count_distinct_values_by_filter(self, document_store: OpenSearchDocumen assert distinct_counts_a["priority"] == 2 # 1, 3 # Count distinct values for documents with status="active" - distinct_counts_active = document_store.count_distinct_values_by_filter( + distinct_counts_active = document_store.count_distinct_metadata_values_by_filter( filters={"field": "meta.status", "operator": "==", "value": "active"} ) assert distinct_counts_active["category"] == 3 # A, B, C @@ -653,7 +653,7 @@ def test_count_distinct_values_by_filter(self, document_store: OpenSearchDocumen assert distinct_counts_active["priority"] == 3 # 1, 2, 3 # Count distinct values with complex filter (category="A" AND status="active") - distinct_counts_a_active = document_store.count_distinct_values_by_filter( + distinct_counts_a_active = document_store.count_distinct_metadata_values_by_filter( filters={ "operator": "AND", "conditions": [ @@ -666,14 +666,14 @@ def test_count_distinct_values_by_filter(self, document_store: OpenSearchDocumen assert distinct_counts_a_active["status"] == 1 # Only active assert distinct_counts_a_active["priority"] == 2 # 1, 3 - def test_get_fields_info(self, document_store: OpenSearchDocumentStore): + def test_get_metadata_fields_info(self, document_store: OpenSearchDocumentStore): docs = [ Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), Document(content="Doc 2", meta={"category": "B", "status": "inactive"}), ] document_store.write_documents(docs) - fields_info = document_store.get_fields_info() + fields_info = document_store.get_metadata_fields_info() # Verify that fields_info contains expected fields assert "content" in fields_info @@ -690,7 +690,7 @@ def test_get_fields_info(self, document_store: OpenSearchDocumentStore): assert fields_info["status"]["type"] == "keyword" assert fields_info["priority"]["type"] == "long" - def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): + def test_get_metadata_field_min_max(self, document_store: OpenSearchDocumentStore): # Test with integer values docs = [ Document(content="Doc 1", meta={"priority": 1, "age": 10}), @@ -705,28 +705,28 @@ def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): document_store.write_documents(docs) # Test with "meta." prefix for integer field - min_max_priority = document_store.get_field_min_max("meta.priority") + min_max_priority = document_store.get_metadata_field_min_max("meta.priority") assert min_max_priority["min"] == 1 assert min_max_priority["max"] == 10 # Test with "meta." prefix for another integer field - min_max_rating = document_store.get_field_min_max("meta.age") + min_max_rating = document_store.get_metadata_field_min_max("meta.age") assert min_max_rating["min"] == 5 assert min_max_rating["max"] == 20 # Test with single value single_doc = [Document(content="Doc 5", meta={"single_value": 42})] document_store.write_documents(single_doc) - min_max_single = document_store.get_field_min_max("meta.single_value") + min_max_single = document_store.get_metadata_field_min_max("meta.single_value") assert min_max_single["min"] == 42 assert min_max_single["max"] == 42 # Test with float values - min_max_score = document_store.get_field_min_max("meta.rating") + min_max_score = document_store.get_metadata_field_min_max("meta.rating") assert min_max_score["min"] == pytest.approx(5.2) assert min_max_score["max"] == pytest.approx(20.3) - def test_get_field_unique_values(self, document_store: OpenSearchDocumentStore): + def test_get_metadata_field_unique_values(self, document_store: OpenSearchDocumentStore): # Test with string values docs = [ Document(content="Python programming", meta={"category": "A", "language": "Python"}), @@ -739,34 +739,38 @@ def test_get_field_unique_values(self, document_store: OpenSearchDocumentStore): document_store.write_documents(docs) # Test getting all unique values without search term - unique_values, total_count = document_store.get_field_unique_values("meta.category", None, 0, 10) + unique_values, total_count = document_store.get_metadata_field_unique_values("meta.category", None, 0, 10) assert set(unique_values) == {"A", "B", "C"} assert total_count == 3 # Test with "meta." prefix - unique_languages, lang_count = document_store.get_field_unique_values("meta.language", None, 0, 10) + unique_languages, lang_count = document_store.get_metadata_field_unique_values("meta.language", None, 0, 10) assert set(unique_languages) == {"Python", "Java", "JavaScript"} assert lang_count == 3 # Test pagination - first page - unique_values_page1, total_count = document_store.get_field_unique_values("meta.category", None, 0, 2) + unique_values_page1, total_count = document_store.get_metadata_field_unique_values("meta.category", None, 0, 2) assert len(unique_values_page1) == 2 assert total_count == 3 assert all(val in ["A", "B", "C"] for val in unique_values_page1) # Test pagination - second page - unique_values_page2, total_count = document_store.get_field_unique_values("meta.category", None, 2, 2) + unique_values_page2, total_count = document_store.get_metadata_field_unique_values("meta.category", None, 2, 2) assert len(unique_values_page2) == 1 assert total_count == 3 assert unique_values_page2[0] in ["A", "B", "C"] # Test with search term - filter by content matching "Python" - unique_values_filtered, total_count = document_store.get_field_unique_values("meta.category", "Python", 0, 10) + unique_values_filtered, total_count = document_store.get_metadata_field_unique_values( + "meta.category", "Python", 0, 10 + ) assert set(unique_values_filtered) == {"A"} # Only category A has documents with "Python" in content assert total_count == 1 # Test with search term - filter by content matching "Java" - unique_values_java, total_count = document_store.get_field_unique_values("meta.category", "Java", 0, 10) + unique_values_java, total_count = document_store.get_metadata_field_unique_values( + "meta.category", "Java", 0, 10 + ) assert set(unique_values_java) == {"B"} # Only category B has documents with "Java" in content assert total_count == 1 @@ -778,12 +782,14 @@ def test_get_field_unique_values(self, document_store: OpenSearchDocumentStore): Document(content="Doc 4", meta={"priority": 3}), ] document_store.write_documents(int_docs) - unique_priorities, priority_count = document_store.get_field_unique_values("meta.priority", None, 0, 10) + unique_priorities, priority_count = document_store.get_metadata_field_unique_values( + "meta.priority", None, 0, 10 + ) assert set(unique_priorities) == {"1", "2", "3"} assert priority_count == 3 # Test with search term on integer field - unique_priorities_filtered, priority_count = document_store.get_field_unique_values( + unique_priorities_filtered, priority_count = document_store.get_metadata_field_unique_values( "meta.priority", "Doc 1", 0, 10 ) assert set(unique_priorities_filtered) == {"1"} diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index 9c0fbe5b54..05df97fe75 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -282,13 +282,13 @@ async def test_count_distinct_values_by_filter(self, document_store: OpenSearchD assert await document_store.count_documents_async() == 5 # count distinct values for all documents - distinct_counts = await document_store.count_distinct_values_by_filter_async(filters={}) + distinct_counts = await document_store.count_distinct_metadata_values_by_filter_async(filters={}) assert distinct_counts["category"] == 3 # A, B, C assert distinct_counts["status"] == 2 # active, inactive assert distinct_counts["priority"] == 3 # 1, 2, 3 # count distinct values for documents with category="A" - distinct_counts_a = await document_store.count_distinct_values_by_filter_async( + distinct_counts_a = await document_store.count_distinct_metadata_values_by_filter_async( filters={"field": "meta.category", "operator": "==", "value": "A"} ) assert distinct_counts_a["category"] == 1 # Only A @@ -296,7 +296,7 @@ async def test_count_distinct_values_by_filter(self, document_store: OpenSearchD assert distinct_counts_a["priority"] == 2 # 1, 3 # count distinct values for documents with status="active" - distinct_counts_active = await document_store.count_distinct_values_by_filter_async( + distinct_counts_active = await document_store.count_distinct_metadata_values_by_filter_async( filters={"field": "meta.status", "operator": "==", "value": "active"} ) assert distinct_counts_active["category"] == 3 # A, B, C @@ -304,7 +304,7 @@ async def test_count_distinct_values_by_filter(self, document_store: OpenSearchD assert distinct_counts_active["priority"] == 3 # 1, 2, 3 # count distinct values with complex filter (category="A" AND status="active") - distinct_counts_a_active = await document_store.count_distinct_values_by_filter_async( + distinct_counts_a_active = await document_store.count_distinct_metadata_values_by_filter_async( filters={ "operator": "AND", "conditions": [ @@ -474,14 +474,14 @@ async def test_update_by_filter_async(self, document_store: OpenSearchDocumentSt assert draft_docs[0].meta["category"] == "B" @pytest.mark.asyncio - async def test_get_fields_info(self, document_store: OpenSearchDocumentStore): + async def test_get_metadata_fields_info_async(self, document_store: OpenSearchDocumentStore): filterable_docs = [ Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), Document(content="Doc 2", meta={"category": "B", "status": "inactive"}), ] await document_store.write_documents_async(filterable_docs) - fields_info = await document_store.get_fields_info_async() + fields_info = await document_store.get_metadata_fields_info_async() # Verify that fields_info contains expected fields assert "content" in fields_info @@ -493,13 +493,14 @@ async def test_get_fields_info(self, document_store: OpenSearchDocumentStore): # Verify field types assert fields_info["content"]["type"] == "text" assert fields_info["embedding"]["type"] == "knn_vector" + # Metadata fields should be keyword type (from dynamic templates) assert fields_info["category"]["type"] == "keyword" assert fields_info["status"]["type"] == "keyword" assert fields_info["priority"]["type"] == "long" @pytest.mark.asyncio - async def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): + async def test_get_metadata_field_min_max_async(self, document_store: OpenSearchDocumentStore): # Test with integer values docs = [ Document(content="Doc 1", meta={"priority": 1, "age": 10}), @@ -514,29 +515,29 @@ async def test_get_field_min_max(self, document_store: OpenSearchDocumentStore): await document_store.write_documents_async(docs) # Test with "meta." prefix for integer field - min_max_priority = await document_store.get_field_min_max_async("meta.priority") + min_max_priority = await document_store.get_metadata_field_min_max_async("meta.priority") assert min_max_priority["min"] == 1 assert min_max_priority["max"] == 10 # Test with "meta." prefix for another integer field - min_max_rating = await document_store.get_field_min_max_async("meta.age") + min_max_rating = await document_store.get_metadata_field_min_max_async("meta.age") assert min_max_rating["min"] == 5 assert min_max_rating["max"] == 20 # Test with single value single_doc = [Document(content="Doc 5", meta={"single_value": 42})] await document_store.write_documents_async(single_doc) - min_max_single = await document_store.get_field_min_max_async("meta.single_value") + min_max_single = await document_store.get_metadata_field_min_max_async("meta.single_value") assert min_max_single["min"] == 42 assert min_max_single["max"] == 42 # Test with float values - min_max_score = await document_store.get_field_min_max_async("meta.rating") + min_max_score = await document_store.get_metadata_field_min_max_async("meta.rating") assert min_max_score["min"] == pytest.approx(5.2) assert min_max_score["max"] == pytest.approx(20.3) @pytest.mark.asyncio - async def test_get_field_unique_values(self, document_store: OpenSearchDocumentStore): + async def test_get_metadata_field_unique_values_async(self, document_store: OpenSearchDocumentStore): # Test with string values docs = [ Document(content="Python programming", meta={"category": "A", "language": "Python"}), @@ -549,17 +550,21 @@ async def test_get_field_unique_values(self, document_store: OpenSearchDocumentS await document_store.write_documents_async(docs) # Test getting all unique values without search term - unique_values, total_count = await document_store.get_field_unique_values_async("meta.category", None, 0, 10) + unique_values, total_count = await document_store.get_metadata_field_unique_values_async( + "meta.category", None, 0, 10 + ) assert set(unique_values) == {"A", "B", "C"} assert total_count == 3 # Test with "meta." prefix - unique_languages, lang_count = await document_store.get_field_unique_values_async("meta.language", None, 0, 10) + unique_languages, lang_count = await document_store.get_metadata_field_unique_values_async( + "meta.language", None, 0, 10 + ) assert set(unique_languages) == {"Python", "Java", "JavaScript"} assert lang_count == 3 # Test pagination - first page - unique_values_page1, total_count = await document_store.get_field_unique_values_async( + unique_values_page1, total_count = await document_store.get_metadata_field_unique_values_async( "meta.category", None, 0, 2 ) assert len(unique_values_page1) == 2 @@ -567,7 +572,7 @@ async def test_get_field_unique_values(self, document_store: OpenSearchDocumentS assert all(val in ["A", "B", "C"] for val in unique_values_page1) # Test pagination - second page - unique_values_page2, total_count = await document_store.get_field_unique_values_async( + unique_values_page2, total_count = await document_store.get_metadata_field_unique_values_async( "meta.category", None, 2, 2 ) assert len(unique_values_page2) == 1 @@ -575,14 +580,14 @@ async def test_get_field_unique_values(self, document_store: OpenSearchDocumentS assert unique_values_page2[0] in ["A", "B", "C"] # Test with search term - filter by content matching "Python" - unique_values_filtered, total_count = await document_store.get_field_unique_values_async( + unique_values_filtered, total_count = await document_store.get_metadata_field_unique_values_async( "meta.category", "Python", 0, 10 ) assert set(unique_values_filtered) == {"A"} # Only category A has documents with "Python" in content assert total_count == 1 # Test with search term - filter by content matching "Java" - unique_values_java, total_count = await document_store.get_field_unique_values_async( + unique_values_java, total_count = await document_store.get_metadata_field_unique_values_async( "meta.category", "Java", 0, 10 ) assert set(unique_values_java) == {"B"} # Only category B has documents with "Java" in content @@ -596,14 +601,14 @@ async def test_get_field_unique_values(self, document_store: OpenSearchDocumentS Document(content="Doc 4", meta={"priority": 3}), ] await document_store.write_documents_async(int_docs) - unique_priorities, priority_count = await document_store.get_field_unique_values_async( + unique_priorities, priority_count = await document_store.get_metadata_field_unique_values_async( "meta.priority", None, 0, 10 ) assert set(unique_priorities) == {"1", "2", "3"} assert priority_count == 3 # Test with search term on integer field - unique_priorities_filtered, priority_count = await document_store.get_field_unique_values_async( + unique_priorities_filtered, priority_count = await document_store.get_metadata_field_unique_values_async( "meta.priority", "Doc 1", 0, 10 ) assert set(unique_priorities_filtered) == {"1"} From 2a65a49a26560bdd4ec9cb43ce89d1bd7b13cb7d Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 11:42:12 +0100 Subject: [PATCH 24/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 872953c1b5..b45850f9ae 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1192,7 +1192,7 @@ def _build_cardinality_aggregations(index_mapping: dict[str, Any]) -> dict[str, return aggs @staticmethod - def _build_distinct_values_query_body(filters: dict, aggs: dict[str, Any]) -> dict[str, Any]: + def _build_distinct_values_query_body(filters: Optional[dict[str, Any]], aggs: dict[str, Any]) -> dict[str, Any]: """ Builds the query body for distinct values counting with filters and aggregations. """ From 60406ac54664726bc8cc45d1578eb5031d4c6694 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 11:53:22 +0100 Subject: [PATCH 25/58] updating function names + tests --- .../document_stores/opensearch/document_store.py | 4 ++-- integrations/opensearch/tests/test_document_store.py | 10 +++++----- .../opensearch/tests/test_document_store_async.py | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index b45850f9ae..4c3b344076 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1226,7 +1226,7 @@ def _extract_distinct_counts_from_aggregations( distinct_counts[field_name] = aggregations[agg_key]["value"] return distinct_counts - def count_distinct_metadata_values_by_filter(self, filters: dict) -> dict[str, int]: + def count_unique_metadata_by_filter(self, filters: dict) -> dict[str, int]: """ Returns the number of unique values for each meta field of the documents that match the provided filters. @@ -1253,7 +1253,7 @@ def count_distinct_metadata_values_by_filter(self, filters: dict) -> dict[str, i # extract cardinality values from aggregations return self._extract_distinct_counts_from_aggregations(result.get("aggregations", {}), index_mapping) - async def count_distinct_metadata_values_by_filter_async(self, filters: dict) -> dict[str, int]: + async def count_unique_metadata_by_filter_async(self, filters: dict) -> dict[str, int]: """ Asynchronously returns the number of unique values for each meta field of the documents that match the provided filters. diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 70a9696318..5a930de613 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -619,7 +619,7 @@ def test_count_documents_by_filter(self, document_store: OpenSearchDocumentStore ) assert count_a_active == 2 - def test_count_distinct_metadata_values_by_filter(self, document_store: OpenSearchDocumentStore): + def test_count_unique_metadata_by_filter(self, document_store: OpenSearchDocumentStore): docs = [ Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), Document(content="Doc 2", meta={"category": "B", "status": "active", "priority": 2}), @@ -631,13 +631,13 @@ def test_count_distinct_metadata_values_by_filter(self, document_store: OpenSear assert document_store.count_documents() == 5 # Count distinct values for all documents - distinct_counts = document_store.count_distinct_metadata_values_by_filter(filters={}) + distinct_counts = document_store.count_unique_metadata_by_filter(filters={}) assert distinct_counts["category"] == 3 # A, B, C assert distinct_counts["status"] == 2 # active, inactive assert distinct_counts["priority"] == 3 # 1, 2, 3 # Count distinct values for documents with category="A" - distinct_counts_a = document_store.count_distinct_metadata_values_by_filter( + distinct_counts_a = document_store.count_unique_metadata_by_filter( filters={"field": "meta.category", "operator": "==", "value": "A"} ) assert distinct_counts_a["category"] == 1 # Only A @@ -645,7 +645,7 @@ def test_count_distinct_metadata_values_by_filter(self, document_store: OpenSear assert distinct_counts_a["priority"] == 2 # 1, 3 # Count distinct values for documents with status="active" - distinct_counts_active = document_store.count_distinct_metadata_values_by_filter( + distinct_counts_active = document_store.count_unique_metadata_by_filter( filters={"field": "meta.status", "operator": "==", "value": "active"} ) assert distinct_counts_active["category"] == 3 # A, B, C @@ -653,7 +653,7 @@ def test_count_distinct_metadata_values_by_filter(self, document_store: OpenSear assert distinct_counts_active["priority"] == 3 # 1, 2, 3 # Count distinct values with complex filter (category="A" AND status="active") - distinct_counts_a_active = document_store.count_distinct_metadata_values_by_filter( + distinct_counts_a_active = document_store.count_unique_metadata_by_filter( filters={ "operator": "AND", "conditions": [ diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index 05df97fe75..6b7f562068 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -270,7 +270,7 @@ async def test_count_documents_by_filter(self, document_store: OpenSearchDocumen assert count_a_active == 2 @pytest.mark.asyncio - async def test_count_distinct_values_by_filter(self, document_store: OpenSearchDocumentStore): + async def test_count_unique_metadata_by_filter(self, document_store: OpenSearchDocumentStore): filterable_docs = [ Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), Document(content="Doc 2", meta={"category": "B", "status": "active", "priority": 2}), @@ -282,13 +282,13 @@ async def test_count_distinct_values_by_filter(self, document_store: OpenSearchD assert await document_store.count_documents_async() == 5 # count distinct values for all documents - distinct_counts = await document_store.count_distinct_metadata_values_by_filter_async(filters={}) + distinct_counts = await document_store.count_unique_metadata_by_filter_async(filters={}) assert distinct_counts["category"] == 3 # A, B, C assert distinct_counts["status"] == 2 # active, inactive assert distinct_counts["priority"] == 3 # 1, 2, 3 # count distinct values for documents with category="A" - distinct_counts_a = await document_store.count_distinct_metadata_values_by_filter_async( + distinct_counts_a = await document_store.count_unique_metadata_by_filter_async( filters={"field": "meta.category", "operator": "==", "value": "A"} ) assert distinct_counts_a["category"] == 1 # Only A @@ -296,7 +296,7 @@ async def test_count_distinct_values_by_filter(self, document_store: OpenSearchD assert distinct_counts_a["priority"] == 2 # 1, 3 # count distinct values for documents with status="active" - distinct_counts_active = await document_store.count_distinct_metadata_values_by_filter_async( + distinct_counts_active = await document_store.count_unique_metadata_by_filter_async( filters={"field": "meta.status", "operator": "==", "value": "active"} ) assert distinct_counts_active["category"] == 3 # A, B, C @@ -304,7 +304,7 @@ async def test_count_distinct_values_by_filter(self, document_store: OpenSearchD assert distinct_counts_active["priority"] == 3 # 1, 2, 3 # count distinct values with complex filter (category="A" AND status="active") - distinct_counts_a_active = await document_store.count_distinct_metadata_values_by_filter_async( + distinct_counts_a_active = await document_store.count_unique_metadata_by_filter_async( filters={ "operator": "AND", "conditions": [ From 7780e762fe2b4bd470171a64d2f922605c8770a1 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 13:30:24 +0100 Subject: [PATCH 26/58] adding SQLRetriever + tests --- .../retrievers/opensearch/__init__.py | 8 ++++- .../opensearch/document_store.py | 16 ++++++++-- .../opensearch/tests/test_document_store.py | 32 ++++++++----------- .../tests/test_document_store_async.py | 32 ++++++++----------- 4 files changed, 49 insertions(+), 39 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/__init__.py b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/__init__.py index 7641b6a421..5f80dbd69f 100644 --- a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/__init__.py +++ b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/__init__.py @@ -5,5 +5,11 @@ from .bm25_retriever import OpenSearchBM25Retriever from .embedding_retriever import OpenSearchEmbeddingRetriever from .open_search_hybrid_retriever import OpenSearchHybridRetriever +from .sql_retriever import OpenSearchSQLRetriever -__all__ = ["OpenSearchBM25Retriever", "OpenSearchEmbeddingRetriever", "OpenSearchHybridRetriever"] +__all__ = [ + "OpenSearchBM25Retriever", + "OpenSearchEmbeddingRetriever", + "OpenSearchHybridRetriever", + "OpenSearchSQLRetriever", +] diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 4c3b344076..82018225c1 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1529,10 +1529,16 @@ def _process_sql_response(response_data: Any, response_format: ResponseFormat) - else: return response_data if isinstance(response_data, str) else str(response_data) - def query_sql(self, query: str, response_format: ResponseFormat = "json") -> Any: + def _query_sql(self, query: str, response_format: ResponseFormat = "json") -> Any: """ Execute a raw OpenSearch SQL query against the index. + This method is not meant to be part of the public interface of + `OpenSearchDocumentStore` nor called directly. + `OpenSearchSQLRetriever` uses this method directly and is the public interface for it. + + See `OpenSearchSQLRetriever` for more information. + :param query: The OpenSearch SQL query to execute :param response_format: The format of the response. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ :returns: The query results in the specified format. For JSON format, returns a list of dictionaries @@ -1593,10 +1599,16 @@ def query_sql(self, query: str, response_format: ResponseFormat = "json") -> Any msg = f"Failed to execute SQL query in OpenSearch: {e!s}" raise DocumentStoreError(msg) from e - async def query_sql_async(self, query: str, response_format: ResponseFormat = "json") -> Any: + async def _query_sql_async(self, query: str, response_format: ResponseFormat = "json") -> Any: """ Asynchronously execute a raw OpenSearch SQL query against the index. + This method is not meant to be part of the public interface of + `OpenSearchDocumentStore` nor called directly. + `OpenSearchSQLRetriever` uses this method directly and is the public interface for it. + + See `OpenSearchSQLRetriever` for more information. + :param query: The OpenSearch SQL query to execute :param response_format: The format of the response. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ :returns: The query results in the specified format. For JSON format, returns a list of dictionaries diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 5a930de613..665e7e4d38 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -796,9 +796,6 @@ def test_get_metadata_field_unique_values(self, document_store: OpenSearchDocume assert priority_count == 1 def test_query_sql(self, document_store: OpenSearchDocumentStore): - """ - Test executing SQL queries against the OpenSearch index. - """ docs = [ Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}), Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}), @@ -807,54 +804,53 @@ def test_query_sql(self, document_store: OpenSearchDocumentStore): ] document_store.write_documents(docs, refresh=True) - # Test SQL query with JSON format (default) + # SQL query with JSON format (default) sql_query = ( f"SELECT content, category, status, priority FROM {document_store._index} " # noqa: S608 f"WHERE category = 'A' ORDER BY priority" ) - result = document_store.query_sql(sql_query, response_format="json") + result = document_store._query_sql(sql_query, response_format="json") - # New format returns a list of dictionaries (the _source from each hit) + # format returns a list of dictionaries (the _source from each hit) assert len(result) == 2 # Two documents with category A assert isinstance(result, list) assert all(isinstance(row, dict) for row in result) - # Verify data contains expected values categories = [row.get("category") for row in result] assert all(cat == "A" for cat in categories) - # Verify all expected fields are present + # verify all expected fields are present for row in result: assert "content" in row assert "category" in row assert "status" in row assert "priority" in row - # Test SQL query with CSV format - result_csv = document_store.query_sql(sql_query, response_format="csv") + # SQL query with CSV format + result_csv = document_store._query_sql(sql_query, response_format="csv") assert isinstance(result_csv, str) assert "content" in result_csv assert "category" in result_csv - # Test SQL query with JDBC format - result_jdbc = document_store.query_sql(sql_query, response_format="jdbc") + # SQL query with JDBC format + result_jdbc = document_store._query_sql(sql_query, response_format="jdbc") # JDBC format can be dict or str depending on OpenSearch version assert result_jdbc is not None - # Test SQL query with RAW format - result_raw = document_store.query_sql(sql_query, response_format="raw") + # SQL query with RAW format + result_raw = document_store._query_sql(sql_query, response_format="raw") assert isinstance(result_raw, str) - # Test COUNT query + # COUNT query count_query = f"SELECT COUNT(*) as total FROM {document_store._index}" # noqa: S608 - count_result = document_store.query_sql(count_query, response_format="json") + count_result = document_store._query_sql(count_query, response_format="json") # COUNT query may return different format, check it's a valid response assert count_result is not None - # Test error handling for invalid SQL query + # error handling for invalid SQL query invalid_query = "SELECT * FROM non_existent_index" with pytest.raises(DocumentStoreError, match="Failed to execute SQL query"): - document_store.query_sql(invalid_query) + document_store._query_sql(invalid_query) @pytest.mark.integration def test_write_with_routing(self, document_store: OpenSearchDocumentStore): diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index 6b7f562068..05ae4d94e7 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -616,9 +616,6 @@ async def test_get_metadata_field_unique_values_async(self, document_store: Open @pytest.mark.asyncio async def test_query_sql(self, document_store: OpenSearchDocumentStore): - """ - Test executing SQL queries against the OpenSearch index. - """ docs = [ Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}), Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}), @@ -627,51 +624,50 @@ async def test_query_sql(self, document_store: OpenSearchDocumentStore): ] await document_store.write_documents_async(docs, refresh=True) - # Test SQL query with JSON format (default) + # SQL query with JSON format (default) sql_query = ( f"SELECT content, category, status, priority FROM {document_store._index} " # noqa: S608 f"WHERE category = 'A' ORDER BY priority" ) - result = await document_store.query_sql_async(sql_query, response_format="json") + result = await document_store._query_sql_async(sql_query, response_format="json") - # New format returns a list of dictionaries (the _source from each hit) + # returns a list of dictionaries (the _source from each hit) assert len(result) == 2 # Two documents with category A assert isinstance(result, list) assert all(isinstance(row, dict) for row in result) - # Verify data contains expected values categories = [row.get("category") for row in result] assert all(cat == "A" for cat in categories) - # Verify all expected fields are present + # all expected fields are present for row in result: assert "content" in row assert "category" in row assert "status" in row assert "priority" in row - # Test SQL query with CSV format - result_csv = await document_store.query_sql_async(sql_query, response_format="csv") + # SQL query with CSV format + result_csv = await document_store._query_sql_async(sql_query, response_format="csv") assert isinstance(result_csv, str) assert "content" in result_csv assert "category" in result_csv - # Test SQL query with JDBC format - result_jdbc = await document_store.query_sql_async(sql_query, response_format="jdbc") + # SQL query with JDBC format + result_jdbc = await document_store._query_sql_async(sql_query, response_format="jdbc") # JDBC format can be dict or str depending on OpenSearch version assert result_jdbc is not None - # Test SQL query with RAW format - result_raw = await document_store.query_sql_async(sql_query, response_format="raw") + # SQL query with RAW format + result_raw = await document_store._query_sql_async(sql_query, response_format="raw") assert isinstance(result_raw, str) - # Test COUNT query + # COUNT query count_query = f"SELECT COUNT(*) as total FROM {document_store._index}" # noqa: S608 - count_result = await document_store.query_sql_async(count_query, response_format="json") + count_result = await document_store._query_sql_async(count_query, response_format="json") # COUNT query may return different format, check it's a valid response assert count_result is not None - # Test error handling for invalid SQL query + # error handling for invalid SQL query invalid_query = "SELECT * FROM non_existent_index" with pytest.raises(DocumentStoreError, match="Failed to execute SQL query"): - await document_store.query_sql_async(invalid_query) + await document_store._query_sql_async(invalid_query) From 67852d50c735e50d2723f9980c3847be5b14690d Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 13:33:09 +0100 Subject: [PATCH 27/58] adding missing files --- .../opensearch/tests/test_sql_retriever.py | 409 ++++++++++++++++++ 1 file changed, 409 insertions(+) create mode 100644 integrations/opensearch/tests/test_sql_retriever.py diff --git a/integrations/opensearch/tests/test_sql_retriever.py b/integrations/opensearch/tests/test_sql_retriever.py new file mode 100644 index 0000000000..dba0b57e01 --- /dev/null +++ b/integrations/opensearch/tests/test_sql_retriever.py @@ -0,0 +1,409 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import Mock, patch + +import pytest +from haystack.dataclasses import Document + +from haystack_integrations.components.retrievers.opensearch import OpenSearchSQLRetriever +from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore + + +def test_init_default(): + mock_store = Mock(spec=OpenSearchDocumentStore) + retriever = OpenSearchSQLRetriever(document_store=mock_store) + assert retriever._document_store == mock_store + assert retriever._response_format == "json" + assert retriever._raise_on_failure is True + + +def test_init_custom(): + mock_store = Mock(spec=OpenSearchDocumentStore) + retriever = OpenSearchSQLRetriever(document_store=mock_store, response_format="csv", raise_on_failure=False) + assert retriever._response_format == "csv" + assert retriever._raise_on_failure is False + + +def test_init_invalid_document_store(): + with pytest.raises(ValueError, match="document_store must be an instance of OpenSearchDocumentStore"): + OpenSearchSQLRetriever(document_store="not a document store") + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_to_dict(_mock_opensearch_client): + document_store = OpenSearchDocumentStore(hosts="some fake host") + retriever = OpenSearchSQLRetriever(document_store=document_store, response_format="csv") + res = retriever.to_dict() + assert res["type"] == "haystack_integrations.components.retrievers.opensearch.sql_retriever.OpenSearchSQLRetriever" + assert res["init_parameters"]["response_format"] == "csv" + assert res["init_parameters"]["raise_on_failure"] is True + assert "document_store" in res["init_parameters"] + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_from_dict(_mock_opensearch_client): + document_store = OpenSearchDocumentStore(hosts="some fake host") + retriever = OpenSearchSQLRetriever(document_store=document_store, response_format="csv") + data = retriever.to_dict() + retriever_from_dict = OpenSearchSQLRetriever.from_dict(data) + assert retriever_from_dict._response_format == "csv" + assert retriever_from_dict._raise_on_failure is True + + +def test_run(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._query_sql.return_value = [{"content": "Test doc", "category": "A"}] + retriever = OpenSearchSQLRetriever(document_store=mock_store) + res = retriever.run(query="SELECT content, category FROM my_index WHERE category = 'A'") + mock_store._query_sql.assert_called_once_with( + query="SELECT content, category FROM my_index WHERE category = 'A'", + response_format="json", + ) + assert len(res) == 1 + assert "result" in res + assert res["result"] == [{"content": "Test doc", "category": "A"}] + + +def test_run_with_custom_response_format(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._query_sql.return_value = "content,category\nTest doc,A" + retriever = OpenSearchSQLRetriever(document_store=mock_store, response_format="csv") + res = retriever.run(query="SELECT content, category FROM my_index") + mock_store._query_sql.assert_called_once_with(query="SELECT content, category FROM my_index", response_format="csv") + assert res["result"] == "content,category\nTest doc,A" + + +def test_run_with_runtime_response_format(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._query_sql.return_value = "raw response" + retriever = OpenSearchSQLRetriever(document_store=mock_store, response_format="json") + res = retriever.run(query="SELECT * FROM my_index", response_format="raw") + mock_store._query_sql.assert_called_once_with(query="SELECT * FROM my_index", response_format="raw") + assert res["result"] == "raw response" + + +def test_run_with_runtime_document_store(): + mock_store1 = Mock(spec=OpenSearchDocumentStore) + mock_store2 = Mock(spec=OpenSearchDocumentStore) + mock_store2._query_sql.return_value = [{"result": "from store 2"}] + retriever = OpenSearchSQLRetriever(document_store=mock_store1) + res = retriever.run(query="SELECT * FROM my_index", document_store=mock_store2) + mock_store1._query_sql.assert_not_called() + mock_store2._query_sql.assert_called_once_with(query="SELECT * FROM my_index", response_format="json") + assert res["result"] == [{"result": "from store 2"}] + + +def test_run_with_error_raise_on_failure(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._query_sql.side_effect = Exception("SQL error") + retriever = OpenSearchSQLRetriever(document_store=mock_store, raise_on_failure=True) + with pytest.raises(Exception, match="SQL error"): + retriever.run(query="SELECT * FROM my_index") + + +def test_run_with_error_no_raise(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._query_sql.side_effect = Exception("SQL error") + retriever = OpenSearchSQLRetriever(document_store=mock_store, raise_on_failure=False) + res = retriever.run(query="SELECT * FROM my_index") + assert res["result"] is None + + +@pytest.mark.integration +def test_sql_retriever_basic_query(document_store: OpenSearchDocumentStore): + """Test basic SQL query execution with JSON format""" + docs = [ + Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}), + Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}), + Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}), + Document(content="JavaScript development", meta={"category": "C", "status": "active", "priority": 1}), + ] + document_store.write_documents(docs, refresh=True) + + retriever = OpenSearchSQLRetriever(document_store=document_store) + sql_query = ( + f"SELECT content, category, status, priority FROM {document_store._index} " # noqa: S608 + f"WHERE category = 'A' ORDER BY priority" + ) + result = retriever.run(query=sql_query) + + assert "result" in result + assert len(result["result"]) == 2 + assert isinstance(result["result"], list) + assert all(isinstance(row, dict) for row in result["result"]) + + categories = [row.get("category") for row in result["result"]] + assert all(cat == "A" for cat in categories) + + for row in result["result"]: + assert "content" in row + assert "category" in row + assert "status" in row + assert "priority" in row + + +@pytest.mark.integration +def test_sql_retriever_csv_format(document_store: OpenSearchDocumentStore): + """Test SQL query with CSV response format""" + docs = [ + Document(content="Python programming", meta={"category": "A", "status": "active"}), + Document(content="Java programming", meta={"category": "B", "status": "active"}), + ] + document_store.write_documents(docs, refresh=True) + + retriever = OpenSearchSQLRetriever(document_store=document_store, response_format="csv") + sql_query = f"SELECT content, category FROM {document_store._index}" # noqa: S608 + result = retriever.run(query=sql_query) + + assert "result" in result + assert isinstance(result["result"], str) + assert "content" in result["result"] + assert "category" in result["result"] + + +@pytest.mark.integration +def test_sql_retriever_count_query(document_store: OpenSearchDocumentStore): + """Test COUNT query execution""" + docs = [ + Document(content="Doc 1", meta={"category": "A"}), + Document(content="Doc 2", meta={"category": "B"}), + Document(content="Doc 3", meta={"category": "A"}), + ] + document_store.write_documents(docs, refresh=True) + + retriever = OpenSearchSQLRetriever(document_store=document_store) + count_query = f"SELECT COUNT(*) as total FROM {document_store._index}" # noqa: S608 + result = retriever.run(query=count_query) + + assert "result" in result + assert result["result"] is not None + + +@pytest.mark.integration +def test_sql_retriever_with_filters(document_store: OpenSearchDocumentStore): + """Test SQL query with WHERE clause filtering""" + + docs = [ + Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}), + Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}), + Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}), + ] + document_store.write_documents(docs, refresh=True) + + retriever = OpenSearchSQLRetriever(document_store=document_store) + sql_query = ( + f"SELECT content, category, status FROM {document_store._index} " # noqa: S608 + f"WHERE category = 'A' AND status = 'active'" + ) + result = retriever.run(query=sql_query) + + assert "result" in result + assert len(result["result"]) == 1 + assert result["result"][0]["category"] == "A" + assert result["result"][0]["status"] == "active" + + +@pytest.mark.integration +def test_sql_retriever_runtime_response_format(document_store: OpenSearchDocumentStore): + """Test overriding response format at runtime""" + docs = [ + Document(content="Python programming", meta={"category": "A"}), + Document(content="Java programming", meta={"category": "B"}), + ] + document_store.write_documents(docs, refresh=True) + + retriever = OpenSearchSQLRetriever(document_store=document_store, response_format="json") + sql_query = f"SELECT content, category FROM {document_store._index}" # noqa: S608 + + # Override with CSV format at runtime + result = retriever.run(query=sql_query, response_format="csv") + assert isinstance(result["result"], str) + assert "content" in result["result"] + + # Use default JSON format + result_json = retriever.run(query=sql_query) + assert isinstance(result_json["result"], list) + + +@pytest.mark.integration +def test_sql_retriever_runtime_document_store_switching( + document_store: OpenSearchDocumentStore, document_store_2: OpenSearchDocumentStore +): + """Test switching document stores at runtime""" + docs1 = [ + Document(content="Python programming", meta={"category": "A"}), + Document(content="Java programming", meta={"category": "B"}), + ] + document_store.write_documents(docs1, refresh=True) + + docs2 = [ + Document(content="JavaScript development", meta={"category": "C"}), + Document(content="TypeScript development", meta={"category": "D"}), + ] + document_store_2.write_documents(docs2, refresh=True) + + retriever = OpenSearchSQLRetriever(document_store=document_store) + + # Query first store + sql_query1 = f"SELECT content, category FROM {document_store._index} WHERE category = 'A'" # noqa: S608 + result1 = retriever.run(query=sql_query1) + assert len(result1["result"]) == 1 + assert "Python" in result1["result"][0]["content"] + + # Query second store at runtime + sql_query2 = f"SELECT content, category FROM {document_store_2._index} WHERE category = 'C'" # noqa: S608 + result2 = retriever.run(query=sql_query2, document_store=document_store_2) + assert len(result2["result"]) == 1 + assert "JavaScript" in result2["result"][0]["content"] + + # Verify results are different + assert result1["result"][0]["content"] != result2["result"][0]["content"] + + +@pytest.mark.integration +def test_sql_retriever_error_handling(document_store: OpenSearchDocumentStore): + """Test error handling for invalid SQL queries""" + retriever = OpenSearchSQLRetriever(document_store=document_store, raise_on_failure=True) + + invalid_query = "SELECT * FROM non_existent_index" + with pytest.raises(Exception, match="Failed to execute SQL query"): + retriever.run(query=invalid_query) + + # Test with raise_on_failure=False + retriever_no_raise = OpenSearchSQLRetriever(document_store=document_store, raise_on_failure=False) + result = retriever_no_raise.run(query=invalid_query) + assert result["result"] is None + + +@pytest.mark.asyncio +async def test_run_async(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._query_sql_async.return_value = [{"content": "Test doc", "category": "A"}] + retriever = OpenSearchSQLRetriever(document_store=mock_store) + res = await retriever.run_async(query="SELECT content, category FROM my_index WHERE category = 'A'") + mock_store._query_sql_async.assert_called_once_with( + query="SELECT content, category FROM my_index WHERE category = 'A'", + response_format="json", + ) + assert len(res) == 1 + assert "result" in res + assert res["result"] == [{"content": "Test doc", "category": "A"}] + + +@pytest.mark.asyncio +async def test_run_async_with_error_raise_on_failure(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._query_sql_async.side_effect = Exception("SQL error") + retriever = OpenSearchSQLRetriever(document_store=mock_store, raise_on_failure=True) + with pytest.raises(Exception, match="SQL error"): + await retriever.run_async(query="SELECT * FROM my_index") + + +@pytest.mark.asyncio +async def test_run_async_with_error_no_raise(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._query_sql_async.side_effect = Exception("SQL error") + retriever = OpenSearchSQLRetriever(document_store=mock_store, raise_on_failure=False) + res = await retriever.run_async(query="SELECT * FROM my_index") + assert res["result"] is None + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_sql_retriever_async_basic_query(document_store: OpenSearchDocumentStore): + """Test basic async SQL query execution""" + docs = [ + Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}), + Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}), + Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}), + ] + await document_store.write_documents_async(docs, refresh=True) + + retriever = OpenSearchSQLRetriever(document_store=document_store) + sql_query = ( + f"SELECT content, category, status FROM {document_store._index} " # noqa: S608 + f"WHERE category = 'A' ORDER BY priority" + ) + result = await retriever.run_async(query=sql_query) + + assert "result" in result + assert len(result["result"]) == 2 + assert isinstance(result["result"], list) + assert all(isinstance(row, dict) for row in result["result"]) + + categories = [row.get("category") for row in result["result"]] + assert all(cat == "A" for cat in categories) + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_sql_retriever_async_csv_format(document_store: OpenSearchDocumentStore): + """Test async SQL query with CSV response format""" + docs = [ + Document(content="Python programming", meta={"category": "A"}), + Document(content="Java programming", meta={"category": "B"}), + ] + await document_store.write_documents_async(docs, refresh=True) + + retriever = OpenSearchSQLRetriever(document_store=document_store, response_format="csv") + sql_query = f"SELECT content, category FROM {document_store._index}" # noqa: S608 + result = await retriever.run_async(query=sql_query) + + assert "result" in result + assert isinstance(result["result"], str) + assert "content" in result["result"] + assert "category" in result["result"] + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_sql_retriever_async_runtime_document_store_switching( + document_store: OpenSearchDocumentStore, document_store_2: OpenSearchDocumentStore +): + """Test async switching document stores at runtime""" + docs1 = [ + Document(content="Python programming", meta={"category": "A"}), + Document(content="Java programming", meta={"category": "B"}), + ] + await document_store.write_documents_async(docs1, refresh=True) + + docs2 = [ + Document(content="JavaScript development", meta={"category": "C"}), + Document(content="TypeScript development", meta={"category": "D"}), + ] + await document_store_2.write_documents_async(docs2, refresh=True) + + retriever = OpenSearchSQLRetriever(document_store=document_store) + + # Query first store + sql_query1 = f"SELECT content, category FROM {document_store._index} WHERE category = 'A'" # noqa: S608 + result1 = await retriever.run_async(query=sql_query1) + assert len(result1["result"]) == 1 + assert "Python" in result1["result"][0]["content"] + + # Query second store at runtime + sql_query2 = f"SELECT content, category FROM {document_store_2._index} WHERE category = 'C'" # noqa: S608 + result2 = await retriever.run_async(query=sql_query2, document_store=document_store_2) + assert len(result2["result"]) == 1 + assert "JavaScript" in result2["result"][0]["content"] + + # Verify results are different + assert result1["result"][0]["content"] != result2["result"][0]["content"] + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_sql_retriever_async_error_handling(document_store: OpenSearchDocumentStore): + """Test async error handling for invalid SQL queries""" + retriever = OpenSearchSQLRetriever(document_store=document_store, raise_on_failure=True) + + invalid_query = "SELECT * FROM non_existent_index" + with pytest.raises(Exception, match="Failed to execute SQL query"): + await retriever.run_async(query=invalid_query) + + # Test with raise_on_failure=False + retriever_no_raise = OpenSearchSQLRetriever(document_store=document_store, raise_on_failure=False) + result = await retriever_no_raise.run_async(query=invalid_query) + assert result["result"] is None From 3985984bca54636eb636ea0e49d79f29d1ba1ccf Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 13:33:36 +0100 Subject: [PATCH 28/58] adding missing files --- .../retrievers/opensearch/sql_retriever.py | 190 ++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/sql_retriever.py diff --git a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/sql_retriever.py b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/sql_retriever.py new file mode 100644 index 0000000000..921639042a --- /dev/null +++ b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/sql_retriever.py @@ -0,0 +1,190 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Literal, Optional + +from haystack import component, default_from_dict, default_to_dict, logging + +from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore + +logger = logging.getLogger(__name__) + +ResponseFormat = Literal["json", "jdbc", "csv", "raw"] + + +@component +class OpenSearchSQLRetriever: + """ + Executes raw OpenSearch SQL queries against an OpenSearchDocumentStore. + + This component allows you to execute SQL queries directly against the OpenSearch index, + which is useful for fetching metadata, aggregations, and other structured data at runtime. + """ + + def __init__( + self, + *, + document_store: OpenSearchDocumentStore, + response_format: ResponseFormat = "json", + raise_on_failure: bool = True, + ): + """ + Creates the OpenSearchSQLRetriever component. + + :param document_store: An instance of OpenSearchDocumentStore to use with the Retriever. + :param response_format: The format of the response. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ + - `json`: Returns a list of dictionaries (the _source from each hit). Default. + - `csv`: Returns the response as CSV text. + - `jdbc`: Returns the response in JDBC format. + - `raw`: Returns the raw response as text. + :param raise_on_failure: + Whether to raise an exception if the API call fails. Otherwise, log a warning and return None. + + :raises ValueError: If `document_store` is not an instance of OpenSearchDocumentStore. + """ + if not isinstance(document_store, OpenSearchDocumentStore): + msg = "document_store must be an instance of OpenSearchDocumentStore" + raise ValueError(msg) + + self._document_store = document_store + self._response_format = response_format + self._raise_on_failure = raise_on_failure + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + document_store=self._document_store.to_dict(), + response_format=self._response_format, + raise_on_failure=self._raise_on_failure, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "OpenSearchSQLRetriever": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + + :returns: + Deserialized component. + """ + data["init_parameters"]["document_store"] = OpenSearchDocumentStore.from_dict( + data["init_parameters"]["document_store"] + ) + return default_from_dict(cls, data) + + @component.output_types(result=Any) + def run( + self, + query: str, + response_format: Optional[ResponseFormat] = None, + document_store: Optional[OpenSearchDocumentStore] = None, + ) -> dict[str, Any]: + """ + Execute a raw OpenSearch SQL query against the index. + + :param query: The OpenSearch SQL query to execute. + :param response_format: The format of the response. If not provided, uses the format + specified during initialization. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ + :param document_store: Optionally, an instance of OpenSearchDocumentStore to use with the Retriever. + + :returns: + A dictionary containing the query results with the following structure: + - result: The query results in the specified format. For JSON format, returns a list of dictionaries + (the _source from each hit). For other formats (csv, jdbc, raw), returns the response as text. + + Example: + ```python + retriever = OpenSearchSQLRetriever(document_store=document_store) + result = retriever.run( + query="SELECT content, category FROM my_index WHERE category = 'A'" + ) + # result["result"] contains a list of dictionaries with the query results + ``` + """ + if document_store is not None: + if not isinstance(document_store, OpenSearchDocumentStore): + msg = "document_store must be an instance of OpenSearchDocumentStore" + raise ValueError(msg) + doc_store = document_store + else: + doc_store = self._document_store + + response_format = response_format or self._response_format + + try: + result = doc_store._query_sql(query=query, response_format=response_format) + except Exception as e: + if self._raise_on_failure: + raise e + else: + logger.warning( + "An error during SQL query execution occurred and will be ignored by returning None: {error}", + error=str(e), + exc_info=True, + ) + result = None + + return {"result": result} + + @component.output_types(result=Any) + async def run_async( + self, + query: str, + response_format: Optional[ResponseFormat] = None, + document_store: Optional[OpenSearchDocumentStore] = None, + ) -> dict[str, Any]: + """ + Asynchronously execute a raw OpenSearch SQL query against the index. + + :param query: The OpenSearch SQL query to execute. + :param response_format: The format of the response. If not provided, uses the format + specified during initialization. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ + :param document_store: Optionally, an instance of OpenSearchDocumentStore to use with the Retriever. + + :returns: + A dictionary containing the query results with the following structure: + - result: The query results in the specified format. For JSON format, returns a list of dictionaries + (the _source from each hit). For other formats (csv, jdbc, raw), returns the response as text. + + Example: + ```python + retriever = OpenSearchSQLRetriever(document_store=document_store) + result = await retriever.run_async( + query="SELECT content, category FROM my_index WHERE category = 'A'" + ) + # result["result"] contains a list of dictionaries with the query results + ``` + """ + if document_store is not None: + if not isinstance(document_store, OpenSearchDocumentStore): + msg = "document_store must be an instance of OpenSearchDocumentStore" + raise ValueError(msg) + doc_store = document_store + else: + doc_store = self._document_store + + response_format = response_format or self._response_format + + try: + result = await doc_store._query_sql_async(query=query, response_format=response_format) + except Exception as e: + if self._raise_on_failure: + raise e + else: + logger.warning( + "An error during SQL query execution occurred and will be ignored by returning None: {error}", + error=str(e), + exc_info=True, + ) + result = None + + return {"result": result} From ea69e28f0dd816d026f702cf989594d11b604fd2 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 14:21:55 +0100 Subject: [PATCH 29/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 82018225c1..920b1da0c9 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1228,7 +1228,7 @@ def _extract_distinct_counts_from_aggregations( def count_unique_metadata_by_filter(self, filters: dict) -> dict[str, int]: """ - Returns the number of unique values for each meta field of the documents that match the provided filters. + Returns the number of unique values for each metadata field of the documents that match the provided filters. :param filters: The filters to apply to count documents. For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) From 7caca707852537f95aea9fc2b55b1346edd29988 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 14:22:06 +0100 Subject: [PATCH 30/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 920b1da0c9..90fc15e06e 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1232,7 +1232,7 @@ def count_unique_metadata_by_filter(self, filters: dict) -> dict[str, int]: :param filters: The filters to apply to count documents. For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) - :returns: The number of unique values for each meta field of the documents that match the filters. + :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered documents. """ self._ensure_initialized() assert self._client is not None From 0f86e3b12b2a3c7a83e7b36161b643c8c2fc7b9d Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 14:22:30 +0100 Subject: [PATCH 31/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 90fc15e06e..0fe7ab4626 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1340,7 +1340,7 @@ def _extract_min_max_from_stats(stats: dict[str, Any]) -> dict[str, Any]: max_value = stats.get("max") return {"min": min_value, "max": max_value} - def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]: + def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, int | None]: """ Returns the minimum and maximum values for the given metadata field. From 07785d63cc5241c218e6b53d090379d174c837b7 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 14:22:39 +0100 Subject: [PATCH 32/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 0fe7ab4626..e95429c33f 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1357,7 +1357,7 @@ def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, int | Non return self._extract_min_max_from_stats(stats) - async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[str, Any]: + async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[str, int | None]: """ Asynchronously returns the minimum and maximum values for the given metadata field. From f70438660f8eb6ae73e2c2159419fddb685ec388 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 14:22:48 +0100 Subject: [PATCH 33/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index e95429c33f..147415e66e 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1362,7 +1362,7 @@ async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[st Asynchronously returns the minimum and maximum values for the given metadata field. :param metadata_field: The metadata field to get the minimum and maximum values for. - :returns: The minimum and maximum values for the given metadata field. + :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the metadata field across all documents. """ await self._ensure_initialized_async() assert self._async_client is not None From b6e00ea89198da04390ee0ae0441a4a8139bd25f Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 14:27:52 +0100 Subject: [PATCH 34/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 147415e66e..a28324de85 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1253,7 +1253,7 @@ def count_unique_metadata_by_filter(self, filters: dict) -> dict[str, int]: # extract cardinality values from aggregations return self._extract_distinct_counts_from_aggregations(result.get("aggregations", {}), index_mapping) - async def count_unique_metadata_by_filter_async(self, filters: dict) -> dict[str, int]: + async def count_unique_metadata_by_filter_async(self, filters: dict[str, Any]) -> dict[str, int]: """ Asynchronously returns the number of unique values for each meta field of the documents that match the provided filters. From 386130a87f22b4c910f6fb4d269c7f0b04ba9c6e Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 14:28:16 +0100 Subject: [PATCH 35/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index a28324de85..92f3768dd1 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1226,7 +1226,7 @@ def _extract_distinct_counts_from_aggregations( distinct_counts[field_name] = aggregations[agg_key]["value"] return distinct_counts - def count_unique_metadata_by_filter(self, filters: dict) -> dict[str, int]: + def count_unique_metadata_by_filter(self, filters: dict[str, Any]) -> dict[str, int]: """ Returns the number of unique values for each metadata field of the documents that match the provided filters. From 743a6a5c88dc44dff1a71e317c983fa8fa7669b0 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 14:29:15 +0100 Subject: [PATCH 36/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 92f3768dd1..faca193b7e 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1255,7 +1255,7 @@ def count_unique_metadata_by_filter(self, filters: dict[str, Any]) -> dict[str, async def count_unique_metadata_by_filter_async(self, filters: dict[str, Any]) -> dict[str, int]: """ - Asynchronously returns the number of unique values for each meta field of the documents that match the + Asynchronously returns the number of unique values for each metadata field of the documents that match the provided filters. :param filters: The filters to apply to count documents. From 54511730fed42693e157429ffe24c085c7a1089b Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 14:29:22 +0100 Subject: [PATCH 37/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index faca193b7e..88f9379ea3 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1260,7 +1260,7 @@ async def count_unique_metadata_by_filter_async(self, filters: dict[str, Any]) - :param filters: The filters to apply to count documents. For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) - :returns: The number of unique values for each meta field of the documents that match the filters. + :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered documents. """ await self._ensure_initialized_async() assert self._async_client is not None From 7a95c33b61515558c4d443e2aade2eecec9917f8 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 14:29:35 +0100 Subject: [PATCH 38/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 88f9379ea3..063a19ce71 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1332,7 +1332,7 @@ def _build_min_max_query_body(field_name: str) -> dict[str, Any]: } @staticmethod - def _extract_min_max_from_stats(stats: dict[str, Any]) -> dict[str, Any]: + def _extract_min_max_from_stats(stats: dict[str, Any]) -> dict[str, int | None]: """ Extracts min and max values from stats aggregation results. """ From 4dc5c02ef90a0ca5e5e70c2e9d4b4fd7746810d1 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 14:29:47 +0100 Subject: [PATCH 39/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 063a19ce71..6a50e641fd 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1345,7 +1345,7 @@ def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, int | Non Returns the minimum and maximum values for the given metadata field. :param metadata_field: The metadata field to get the minimum and maximum values for. - :returns: The minimum and maximum values for the given metadata field. + :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the metadata field across all documents. """ self._ensure_initialized() assert self._client is not None From a9f35d2ca3659f45f55e51fce3761279c30e4335 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 15:16:32 +0100 Subject: [PATCH 40/58] PR comments/fixes --- .../opensearch/document_store.py | 30 ++++++++++++++++--- .../opensearch/tests/test_document_store.py | 6 ---- .../tests/test_document_store_async.py | 7 ----- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 6a50e641fd..a29c02726c 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1232,7 +1232,8 @@ def count_unique_metadata_by_filter(self, filters: dict[str, Any]) -> dict[str, :param filters: The filters to apply to count documents. For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) - :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered documents. + :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered + documents. """ self._ensure_initialized() assert self._client is not None @@ -1260,7 +1261,8 @@ async def count_unique_metadata_by_filter_async(self, filters: dict[str, Any]) - :param filters: The filters to apply to count documents. For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) - :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered documents. + :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered + documents. """ await self._ensure_initialized_async() assert self._async_client is not None @@ -1285,6 +1287,20 @@ def get_metadata_fields_info(self) -> dict[str, dict]: """ Returns the information about the fields in the index. + If we populated the index with documents like: + + Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}) + Document(content="Doc 2", meta={"category": "B", "status": "inactive"}) + + This method would return: + + { + 'content': {'type': 'text'}, + 'category': {'type': 'keyword'}, + 'status': {'type': 'keyword'}, + 'priority': {'type': 'long'}, + } + :returns: The information about the fields in the index. """ self._ensure_initialized() @@ -1292,6 +1308,8 @@ def get_metadata_fields_info(self) -> dict[str, dict]: mapping = self._client.indices.get_mapping(index=self._index) index_mapping = mapping[self._index]["mappings"]["properties"] + # remove all fields that are not metadata fields + index_mapping = {k: v for k, v in index_mapping.items() if k not in SPECIAL_FIELDS} return index_mapping async def get_metadata_fields_info_async(self) -> dict[str, dict]: @@ -1305,6 +1323,8 @@ async def get_metadata_fields_info_async(self) -> dict[str, dict]: mapping = await self._async_client.indices.get_mapping(index=self._index) index_mapping = mapping[self._index]["mappings"]["properties"] + # remove all fields that are not metadata fields + index_mapping = {k: v for k, v in index_mapping.items() if k not in SPECIAL_FIELDS} return index_mapping @staticmethod @@ -1345,7 +1365,8 @@ def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, int | Non Returns the minimum and maximum values for the given metadata field. :param metadata_field: The metadata field to get the minimum and maximum values for. - :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the metadata field across all documents. + :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the + metadata field across all documents. """ self._ensure_initialized() assert self._client is not None @@ -1362,7 +1383,8 @@ async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[st Asynchronously returns the minimum and maximum values for the given metadata field. :param metadata_field: The metadata field to get the minimum and maximum values for. - :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the metadata field across all documents. + :returns: A dictionary with the keys "min" and "max", where each value is the minimum or maximum value of the + metadata field across all documents. """ await self._ensure_initialized_async() assert self._async_client is not None diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 665e7e4d38..c67cee5e42 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -676,16 +676,10 @@ def test_get_metadata_fields_info(self, document_store: OpenSearchDocumentStore) fields_info = document_store.get_metadata_fields_info() # Verify that fields_info contains expected fields - assert "content" in fields_info - assert "embedding" in fields_info assert "category" in fields_info assert "status" in fields_info assert "priority" in fields_info - # Verify field types - assert fields_info["content"]["type"] == "text" - assert fields_info["embedding"]["type"] == "knn_vector" - # Metadata fields should be keyword type (from dynamic templates) assert fields_info["category"]["type"] == "keyword" assert fields_info["status"]["type"] == "keyword" assert fields_info["priority"]["type"] == "long" diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index 05ae4d94e7..879a8de33b 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -484,17 +484,10 @@ async def test_get_metadata_fields_info_async(self, document_store: OpenSearchDo fields_info = await document_store.get_metadata_fields_info_async() # Verify that fields_info contains expected fields - assert "content" in fields_info - assert "embedding" in fields_info assert "category" in fields_info assert "status" in fields_info assert "priority" in fields_info - # Verify field types - assert fields_info["content"]["type"] == "text" - assert fields_info["embedding"]["type"] == "knn_vector" - - # Metadata fields should be keyword type (from dynamic templates) assert fields_info["category"]["type"] == "keyword" assert fields_info["status"]["type"] == "keyword" assert fields_info["priority"]["type"] == "long" From eb261f96f4f6b5a4389dbeda4d0ca0f6fb30e0d3 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 15:18:45 +0100 Subject: [PATCH 41/58] fixes --- .../components/retrievers/opensearch/sql_retriever.py | 5 ++--- .../document_stores/opensearch/document_store.py | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/sql_retriever.py b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/sql_retriever.py index 921639042a..a7ec5dadb7 100644 --- a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/sql_retriever.py +++ b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/sql_retriever.py @@ -2,16 +2,15 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Literal, Optional +from typing import Any, Optional from haystack import component, default_from_dict, default_to_dict, logging from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore +from haystack_integrations.document_stores.opensearch.document_store import ResponseFormat logger = logging.getLogger(__name__) -ResponseFormat = Literal["json", "jdbc", "csv", "raw"] - @component class OpenSearchSQLRetriever: diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index a29c02726c..ca1b70eef0 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -26,10 +26,10 @@ SPECIAL_FIELDS = {"content", "embedding", "id", "score", "sparse_embedding", "blob"} -Hosts = Union[str, list[Union[str, Mapping[str, Union[str, int]]]]] - ResponseFormat = Literal["json", "jdbc", "csv", "raw"] +Hosts = Union[str, list[Union[str, Mapping[str, Union[str, int]]]]] + # document scores are essentially unbounded and will be scaled to values between 0 and 1 if scale_score is set to # True. Scaling uses the expit function (inverse of the logit function) after applying a scaling factor # (e.g., BM25_SCALING_FACTOR for the bm25_retrieval method). @@ -1233,7 +1233,7 @@ def count_unique_metadata_by_filter(self, filters: dict[str, Any]) -> dict[str, :param filters: The filters to apply to count documents. For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered - documents. + documents. """ self._ensure_initialized() assert self._client is not None From 99a17dbc470195d9f473cdd290dc1104d0f96d8f Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 15:25:37 +0100 Subject: [PATCH 42/58] improving docstring --- .../document_stores/opensearch/document_store.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index ca1b70eef0..7b62fcd5b2 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1316,6 +1316,20 @@ async def get_metadata_fields_info_async(self) -> dict[str, dict]: """ Asynchronously returns the information about the fields in the index. + If we populated the index with documents like: + + Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}) + Document(content="Doc 2", meta={"category": "B", "status": "inactive"}) + + This method would return: + + { + 'content': {'type': 'text'}, + 'category': {'type': 'keyword'}, + 'status': {'type': 'keyword'}, + 'priority': {'type': 'long'}, + } + :returns: The information about the fields in the index. """ await self._ensure_initialized_async() From f5eaf4bec5dc370ebe0d2a42aeae993cfa61b0bd Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 15:32:11 +0100 Subject: [PATCH 43/58] updating docs --- integrations/opensearch/pydoc/config_docusaurus.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/opensearch/pydoc/config_docusaurus.yml b/integrations/opensearch/pydoc/config_docusaurus.yml index 7e711f6f34..d10454000f 100644 --- a/integrations/opensearch/pydoc/config_docusaurus.yml +++ b/integrations/opensearch/pydoc/config_docusaurus.yml @@ -5,6 +5,7 @@ loaders: - haystack_integrations.components.retrievers.opensearch.bm25_retriever - haystack_integrations.components.retrievers.opensearch.embedding_retriever - haystack_integrations.components.retrievers.opensearch.open_search_hybrid_retriever + - haystack_integrations.components.retrievers.opensearch.sql_retriever - haystack_integrations.document_stores.opensearch.document_store - haystack_integrations.document_stores.opensearch.filters search_path: From 1f1b83294b41b5db3120c040fd8fb97c5a224b97 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 15:36:44 +0100 Subject: [PATCH 44/58] removing all SQLRetriever related code --- .../opensearch/document_store.py | 139 ------------------ .../opensearch/tests/test_document_store.py | 57 ------- .../tests/test_document_store_async.py | 58 -------- 3 files changed, 254 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 7b62fcd5b2..5568d72a49 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1564,142 +1564,3 @@ def _process_sql_response(response_data: Any, response_format: ResponseFormat) - return response_data else: return response_data if isinstance(response_data, str) else str(response_data) - - def _query_sql(self, query: str, response_format: ResponseFormat = "json") -> Any: - """ - Execute a raw OpenSearch SQL query against the index. - - This method is not meant to be part of the public interface of - `OpenSearchDocumentStore` nor called directly. - `OpenSearchSQLRetriever` uses this method directly and is the public interface for it. - - See `OpenSearchSQLRetriever` for more information. - - :param query: The OpenSearch SQL query to execute - :param response_format: The format of the response. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ - :returns: The query results in the specified format. For JSON format, returns a list of dictionaries - (the _source from each hit). For other formats (csv, jdbc, raw), returns the response as text. - - NOTE: For non-JSON formats (csv, jdbc, raw), use requests to make a raw HTTP request and get the text response - This avoids deserialization issues with the opensearchpy client. - """ - self._ensure_initialized() - assert self._client is not None - - # For non-JSON formats, use requests directly to avoid deserialization issues - if response_format != "json": - try: - # Get connection info from the transport - connection = self._client.transport.get_connection() - base_url = connection.host - url, headers, auth = self._prepare_sql_http_request_params(base_url, response_format) - - verify = self._verify_certs if self._verify_certs is not None else True - timeout = self._timeout if self._timeout is not None else 30.0 - response = requests.post( - url, - json={"query": query}, - headers=headers, - auth=auth, - verify=verify, - timeout=timeout, - ) - response.raise_for_status() - return response.text - except Exception as e: - # If requests fails (e.g., AWS auth), fall back to opensearchpy - # which will raise SerializationError that we can handle - logger.error(f"Failed to execute SQL query in OpenSearch: {e!s}") - - try: - body = {"query": query} - params = {"format": response_format} - - response_data = self._client.transport.perform_request( - method="POST", - url="/_plugins/_sql", - params=params, - body=body, - ) - - return self._process_sql_response(response_data, response_format) - except SerializationError: - # If we get here, it means requests failed above (likely AWS auth) and opensearchpy can't deserialize the - # response. Re-raise as DocumentStoreError with a helpful message - msg = ( - f"Failed to execute SQL query in OpenSearch: Unable to deserialize {response_format} response. " - f"This format may not be supported with the current authentication method." - ) - raise DocumentStoreError(msg) from None - except Exception as e: - msg = f"Failed to execute SQL query in OpenSearch: {e!s}" - raise DocumentStoreError(msg) from e - - async def _query_sql_async(self, query: str, response_format: ResponseFormat = "json") -> Any: - """ - Asynchronously execute a raw OpenSearch SQL query against the index. - - This method is not meant to be part of the public interface of - `OpenSearchDocumentStore` nor called directly. - `OpenSearchSQLRetriever` uses this method directly and is the public interface for it. - - See `OpenSearchSQLRetriever` for more information. - - :param query: The OpenSearch SQL query to execute - :param response_format: The format of the response. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ - :returns: The query results in the specified format. For JSON format, returns a list of dictionaries - (the _source from each hit). For other formats (csv, jdbc, raw), returns the response as text. - - NOTE: For non-JSON formats (csv, jdbc, raw), use httpx AsyncClient to make a raw HTTP request and get the text - response. This avoids deserialization issues with the opensearchpy client. - """ - await self._ensure_initialized_async() - assert self._async_client is not None - - # For non-JSON formats, use httpx directly to avoid deserialization issues - if response_format != "json": - try: - # Get connection info from the transport - connection = self._async_client.transport.get_connection() - base_url = connection.host - url, headers, auth = self._prepare_sql_http_request_params(base_url, response_format) - - verify = self._verify_certs if self._verify_certs is not None else True - timeout = httpx.Timeout(self._timeout if self._timeout else 30.0) - - async with httpx.AsyncClient(verify=verify, timeout=timeout) as client: - response = await client.post( - url, - json={"query": query}, - headers=headers, - auth=auth, - ) - response.raise_for_status() - return response.text - except Exception as e: - logger.error(f"Failed to execute SQL query in OpenSearch: {e!s}") - - try: - body = {"query": query} - params = {"format": response_format} - - response_data = await self._async_client.transport.perform_request( - method="POST", - url="/_plugins/_sql", - params=params, - body=body, - ) - - return self._process_sql_response(response_data, response_format) - except SerializationError: - # If we get here, it means httpx failed above (likely AWS auth or not installed) and opensearchpy can't - # deserialize the response. Re-raise as DocumentStoreError with a helpful message - msg = ( - f"Failed to execute SQL query in OpenSearch: Unable to deserialize {response_format} response. " - f"This format may not be supported with the current authentication method. " - f"Consider installing httpx for better support." - ) - raise DocumentStoreError(msg) from None - except Exception as e: - msg = f"Failed to execute SQL query in OpenSearch: {e!s}" - raise DocumentStoreError(msg) from e diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index c67cee5e42..e02d7a1aaf 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -789,63 +789,6 @@ def test_get_metadata_field_unique_values(self, document_store: OpenSearchDocume assert set(unique_priorities_filtered) == {"1"} assert priority_count == 1 - def test_query_sql(self, document_store: OpenSearchDocumentStore): - docs = [ - Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}), - Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}), - Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}), - Document(content="JavaScript development", meta={"category": "C", "status": "active", "priority": 1}), - ] - document_store.write_documents(docs, refresh=True) - - # SQL query with JSON format (default) - sql_query = ( - f"SELECT content, category, status, priority FROM {document_store._index} " # noqa: S608 - f"WHERE category = 'A' ORDER BY priority" - ) - result = document_store._query_sql(sql_query, response_format="json") - - # format returns a list of dictionaries (the _source from each hit) - assert len(result) == 2 # Two documents with category A - assert isinstance(result, list) - assert all(isinstance(row, dict) for row in result) - - categories = [row.get("category") for row in result] - assert all(cat == "A" for cat in categories) - - # verify all expected fields are present - for row in result: - assert "content" in row - assert "category" in row - assert "status" in row - assert "priority" in row - - # SQL query with CSV format - result_csv = document_store._query_sql(sql_query, response_format="csv") - assert isinstance(result_csv, str) - assert "content" in result_csv - assert "category" in result_csv - - # SQL query with JDBC format - result_jdbc = document_store._query_sql(sql_query, response_format="jdbc") - # JDBC format can be dict or str depending on OpenSearch version - assert result_jdbc is not None - - # SQL query with RAW format - result_raw = document_store._query_sql(sql_query, response_format="raw") - assert isinstance(result_raw, str) - - # COUNT query - count_query = f"SELECT COUNT(*) as total FROM {document_store._index}" # noqa: S608 - count_result = document_store._query_sql(count_query, response_format="json") - # COUNT query may return different format, check it's a valid response - assert count_result is not None - - # error handling for invalid SQL query - invalid_query = "SELECT * FROM non_existent_index" - with pytest.raises(DocumentStoreError, match="Failed to execute SQL query"): - document_store._query_sql(invalid_query) - @pytest.mark.integration def test_write_with_routing(self, document_store: OpenSearchDocumentStore): """Test writing documents with routing metadata""" diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index 879a8de33b..b18adfab24 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -606,61 +606,3 @@ async def test_get_metadata_field_unique_values_async(self, document_store: Open ) assert set(unique_priorities_filtered) == {"1"} assert priority_count == 1 - - @pytest.mark.asyncio - async def test_query_sql(self, document_store: OpenSearchDocumentStore): - docs = [ - Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}), - Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}), - Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}), - Document(content="JavaScript development", meta={"category": "C", "status": "active", "priority": 1}), - ] - await document_store.write_documents_async(docs, refresh=True) - - # SQL query with JSON format (default) - sql_query = ( - f"SELECT content, category, status, priority FROM {document_store._index} " # noqa: S608 - f"WHERE category = 'A' ORDER BY priority" - ) - result = await document_store._query_sql_async(sql_query, response_format="json") - - # returns a list of dictionaries (the _source from each hit) - assert len(result) == 2 # Two documents with category A - assert isinstance(result, list) - assert all(isinstance(row, dict) for row in result) - - categories = [row.get("category") for row in result] - assert all(cat == "A" for cat in categories) - - # all expected fields are present - for row in result: - assert "content" in row - assert "category" in row - assert "status" in row - assert "priority" in row - - # SQL query with CSV format - result_csv = await document_store._query_sql_async(sql_query, response_format="csv") - assert isinstance(result_csv, str) - assert "content" in result_csv - assert "category" in result_csv - - # SQL query with JDBC format - result_jdbc = await document_store._query_sql_async(sql_query, response_format="jdbc") - # JDBC format can be dict or str depending on OpenSearch version - assert result_jdbc is not None - - # SQL query with RAW format - result_raw = await document_store._query_sql_async(sql_query, response_format="raw") - assert isinstance(result_raw, str) - - # COUNT query - count_query = f"SELECT COUNT(*) as total FROM {document_store._index}" # noqa: S608 - count_result = await document_store._query_sql_async(count_query, response_format="json") - # COUNT query may return different format, check it's a valid response - assert count_result is not None - - # error handling for invalid SQL query - invalid_query = "SELECT * FROM non_existent_index" - with pytest.raises(DocumentStoreError, match="Failed to execute SQL query"): - await document_store._query_sql_async(invalid_query) From 3b1d2d917308f32b81821aa33caa17c67fcc7401 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 15:41:07 +0100 Subject: [PATCH 45/58] removing all SQLRetriever related code --- .../retrievers/opensearch/sql_retriever.py | 189 ------------------ .../opensearch/document_store.py | 3 - .../opensearch/tests/test_document_store.py | 2 +- .../tests/test_document_store_async.py | 1 - 4 files changed, 1 insertion(+), 194 deletions(-) delete mode 100644 integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/sql_retriever.py diff --git a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/sql_retriever.py b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/sql_retriever.py deleted file mode 100644 index a7ec5dadb7..0000000000 --- a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/sql_retriever.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-FileCopyrightText: 2023-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -from typing import Any, Optional - -from haystack import component, default_from_dict, default_to_dict, logging - -from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore -from haystack_integrations.document_stores.opensearch.document_store import ResponseFormat - -logger = logging.getLogger(__name__) - - -@component -class OpenSearchSQLRetriever: - """ - Executes raw OpenSearch SQL queries against an OpenSearchDocumentStore. - - This component allows you to execute SQL queries directly against the OpenSearch index, - which is useful for fetching metadata, aggregations, and other structured data at runtime. - """ - - def __init__( - self, - *, - document_store: OpenSearchDocumentStore, - response_format: ResponseFormat = "json", - raise_on_failure: bool = True, - ): - """ - Creates the OpenSearchSQLRetriever component. - - :param document_store: An instance of OpenSearchDocumentStore to use with the Retriever. - :param response_format: The format of the response. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ - - `json`: Returns a list of dictionaries (the _source from each hit). Default. - - `csv`: Returns the response as CSV text. - - `jdbc`: Returns the response in JDBC format. - - `raw`: Returns the raw response as text. - :param raise_on_failure: - Whether to raise an exception if the API call fails. Otherwise, log a warning and return None. - - :raises ValueError: If `document_store` is not an instance of OpenSearchDocumentStore. - """ - if not isinstance(document_store, OpenSearchDocumentStore): - msg = "document_store must be an instance of OpenSearchDocumentStore" - raise ValueError(msg) - - self._document_store = document_store - self._response_format = response_format - self._raise_on_failure = raise_on_failure - - def to_dict(self) -> dict[str, Any]: - """ - Serializes the component to a dictionary. - - :returns: - Dictionary with serialized data. - """ - return default_to_dict( - self, - document_store=self._document_store.to_dict(), - response_format=self._response_format, - raise_on_failure=self._raise_on_failure, - ) - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "OpenSearchSQLRetriever": - """ - Deserializes the component from a dictionary. - - :param data: - Dictionary to deserialize from. - - :returns: - Deserialized component. - """ - data["init_parameters"]["document_store"] = OpenSearchDocumentStore.from_dict( - data["init_parameters"]["document_store"] - ) - return default_from_dict(cls, data) - - @component.output_types(result=Any) - def run( - self, - query: str, - response_format: Optional[ResponseFormat] = None, - document_store: Optional[OpenSearchDocumentStore] = None, - ) -> dict[str, Any]: - """ - Execute a raw OpenSearch SQL query against the index. - - :param query: The OpenSearch SQL query to execute. - :param response_format: The format of the response. If not provided, uses the format - specified during initialization. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ - :param document_store: Optionally, an instance of OpenSearchDocumentStore to use with the Retriever. - - :returns: - A dictionary containing the query results with the following structure: - - result: The query results in the specified format. For JSON format, returns a list of dictionaries - (the _source from each hit). For other formats (csv, jdbc, raw), returns the response as text. - - Example: - ```python - retriever = OpenSearchSQLRetriever(document_store=document_store) - result = retriever.run( - query="SELECT content, category FROM my_index WHERE category = 'A'" - ) - # result["result"] contains a list of dictionaries with the query results - ``` - """ - if document_store is not None: - if not isinstance(document_store, OpenSearchDocumentStore): - msg = "document_store must be an instance of OpenSearchDocumentStore" - raise ValueError(msg) - doc_store = document_store - else: - doc_store = self._document_store - - response_format = response_format or self._response_format - - try: - result = doc_store._query_sql(query=query, response_format=response_format) - except Exception as e: - if self._raise_on_failure: - raise e - else: - logger.warning( - "An error during SQL query execution occurred and will be ignored by returning None: {error}", - error=str(e), - exc_info=True, - ) - result = None - - return {"result": result} - - @component.output_types(result=Any) - async def run_async( - self, - query: str, - response_format: Optional[ResponseFormat] = None, - document_store: Optional[OpenSearchDocumentStore] = None, - ) -> dict[str, Any]: - """ - Asynchronously execute a raw OpenSearch SQL query against the index. - - :param query: The OpenSearch SQL query to execute. - :param response_format: The format of the response. If not provided, uses the format - specified during initialization. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/ - :param document_store: Optionally, an instance of OpenSearchDocumentStore to use with the Retriever. - - :returns: - A dictionary containing the query results with the following structure: - - result: The query results in the specified format. For JSON format, returns a list of dictionaries - (the _source from each hit). For other formats (csv, jdbc, raw), returns the response as text. - - Example: - ```python - retriever = OpenSearchSQLRetriever(document_store=document_store) - result = await retriever.run_async( - query="SELECT content, category FROM my_index WHERE category = 'A'" - ) - # result["result"] contains a list of dictionaries with the query results - ``` - """ - if document_store is not None: - if not isinstance(document_store, OpenSearchDocumentStore): - msg = "document_store must be an instance of OpenSearchDocumentStore" - raise ValueError(msg) - doc_store = document_store - else: - doc_store = self._document_store - - response_format = response_format or self._response_format - - try: - result = await doc_store._query_sql_async(query=query, response_format=response_format) - except Exception as e: - if self._raise_on_failure: - raise e - else: - logger.warning( - "An error during SQL query execution occurred and will be ignored by returning None: {error}", - error=str(e), - exc_info=True, - ) - result = None - - return {"result": result} diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 5568d72a49..73f8f4428a 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -8,15 +8,12 @@ from math import exp from typing import Any, Literal, Optional, Union -import httpx -import requests from haystack import default_from_dict, default_to_dict, logging from haystack.dataclasses import Document from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack.utils.auth import Secret from opensearchpy import AsyncHttpConnection, AsyncOpenSearch, OpenSearch -from opensearchpy.exceptions import SerializationError from opensearchpy.helpers import async_bulk, bulk from haystack_integrations.document_stores.opensearch.auth import AsyncAWSAuth, AWSAuth diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index e02d7a1aaf..3648b44a1f 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023-present deepset GmbH +gi# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index b18adfab24..a1b5271400 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -4,7 +4,6 @@ import pytest from haystack.dataclasses import Document -from haystack.document_stores.errors import DocumentStoreError from haystack.document_stores.types import DuplicatePolicy from haystack_integrations.document_stores.opensearch.document_store import OpenSearchDocumentStore From 78019c317ffa47899d850a6441505626ef858071 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 15:44:51 +0100 Subject: [PATCH 46/58] cleaning up typo --- integrations/opensearch/tests/test_document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 3648b44a1f..e02d7a1aaf 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -1,4 +1,4 @@ -gi# SPDX-FileCopyrightText: 2023-present deepset GmbH +# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 From 041faa84eeea503323d42e9a7e9d46fded40752d Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 15:47:19 +0100 Subject: [PATCH 47/58] updating init --- .../components/retrievers/opensearch/__init__.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/__init__.py b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/__init__.py index 5f80dbd69f..7641b6a421 100644 --- a/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/__init__.py +++ b/integrations/opensearch/src/haystack_integrations/components/retrievers/opensearch/__init__.py @@ -5,11 +5,5 @@ from .bm25_retriever import OpenSearchBM25Retriever from .embedding_retriever import OpenSearchEmbeddingRetriever from .open_search_hybrid_retriever import OpenSearchHybridRetriever -from .sql_retriever import OpenSearchSQLRetriever -__all__ = [ - "OpenSearchBM25Retriever", - "OpenSearchEmbeddingRetriever", - "OpenSearchHybridRetriever", - "OpenSearchSQLRetriever", -] +__all__ = ["OpenSearchBM25Retriever", "OpenSearchEmbeddingRetriever", "OpenSearchHybridRetriever"] From 5d45544de9ccaa0da950bd939606994513f1db4b Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 15:49:42 +0100 Subject: [PATCH 48/58] reverting docs updated --- integrations/opensearch/pydoc/config_docusaurus.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/integrations/opensearch/pydoc/config_docusaurus.yml b/integrations/opensearch/pydoc/config_docusaurus.yml index d10454000f..7e711f6f34 100644 --- a/integrations/opensearch/pydoc/config_docusaurus.yml +++ b/integrations/opensearch/pydoc/config_docusaurus.yml @@ -5,7 +5,6 @@ loaders: - haystack_integrations.components.retrievers.opensearch.bm25_retriever - haystack_integrations.components.retrievers.opensearch.embedding_retriever - haystack_integrations.components.retrievers.opensearch.open_search_hybrid_retriever - - haystack_integrations.components.retrievers.opensearch.sql_retriever - haystack_integrations.document_stores.opensearch.document_store - haystack_integrations.document_stores.opensearch.filters search_path: From 131e3b5f878288db0ecced63c993ef05bd837d24 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 13 Jan 2026 15:54:01 +0100 Subject: [PATCH 49/58] removing tests for SQLRetrieve --- .../opensearch/tests/test_sql_retriever.py | 409 ------------------ 1 file changed, 409 deletions(-) delete mode 100644 integrations/opensearch/tests/test_sql_retriever.py diff --git a/integrations/opensearch/tests/test_sql_retriever.py b/integrations/opensearch/tests/test_sql_retriever.py deleted file mode 100644 index dba0b57e01..0000000000 --- a/integrations/opensearch/tests/test_sql_retriever.py +++ /dev/null @@ -1,409 +0,0 @@ -# SPDX-FileCopyrightText: 2023-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -from unittest.mock import Mock, patch - -import pytest -from haystack.dataclasses import Document - -from haystack_integrations.components.retrievers.opensearch import OpenSearchSQLRetriever -from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore - - -def test_init_default(): - mock_store = Mock(spec=OpenSearchDocumentStore) - retriever = OpenSearchSQLRetriever(document_store=mock_store) - assert retriever._document_store == mock_store - assert retriever._response_format == "json" - assert retriever._raise_on_failure is True - - -def test_init_custom(): - mock_store = Mock(spec=OpenSearchDocumentStore) - retriever = OpenSearchSQLRetriever(document_store=mock_store, response_format="csv", raise_on_failure=False) - assert retriever._response_format == "csv" - assert retriever._raise_on_failure is False - - -def test_init_invalid_document_store(): - with pytest.raises(ValueError, match="document_store must be an instance of OpenSearchDocumentStore"): - OpenSearchSQLRetriever(document_store="not a document store") - - -@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") -def test_to_dict(_mock_opensearch_client): - document_store = OpenSearchDocumentStore(hosts="some fake host") - retriever = OpenSearchSQLRetriever(document_store=document_store, response_format="csv") - res = retriever.to_dict() - assert res["type"] == "haystack_integrations.components.retrievers.opensearch.sql_retriever.OpenSearchSQLRetriever" - assert res["init_parameters"]["response_format"] == "csv" - assert res["init_parameters"]["raise_on_failure"] is True - assert "document_store" in res["init_parameters"] - - -@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") -def test_from_dict(_mock_opensearch_client): - document_store = OpenSearchDocumentStore(hosts="some fake host") - retriever = OpenSearchSQLRetriever(document_store=document_store, response_format="csv") - data = retriever.to_dict() - retriever_from_dict = OpenSearchSQLRetriever.from_dict(data) - assert retriever_from_dict._response_format == "csv" - assert retriever_from_dict._raise_on_failure is True - - -def test_run(): - mock_store = Mock(spec=OpenSearchDocumentStore) - mock_store._query_sql.return_value = [{"content": "Test doc", "category": "A"}] - retriever = OpenSearchSQLRetriever(document_store=mock_store) - res = retriever.run(query="SELECT content, category FROM my_index WHERE category = 'A'") - mock_store._query_sql.assert_called_once_with( - query="SELECT content, category FROM my_index WHERE category = 'A'", - response_format="json", - ) - assert len(res) == 1 - assert "result" in res - assert res["result"] == [{"content": "Test doc", "category": "A"}] - - -def test_run_with_custom_response_format(): - mock_store = Mock(spec=OpenSearchDocumentStore) - mock_store._query_sql.return_value = "content,category\nTest doc,A" - retriever = OpenSearchSQLRetriever(document_store=mock_store, response_format="csv") - res = retriever.run(query="SELECT content, category FROM my_index") - mock_store._query_sql.assert_called_once_with(query="SELECT content, category FROM my_index", response_format="csv") - assert res["result"] == "content,category\nTest doc,A" - - -def test_run_with_runtime_response_format(): - mock_store = Mock(spec=OpenSearchDocumentStore) - mock_store._query_sql.return_value = "raw response" - retriever = OpenSearchSQLRetriever(document_store=mock_store, response_format="json") - res = retriever.run(query="SELECT * FROM my_index", response_format="raw") - mock_store._query_sql.assert_called_once_with(query="SELECT * FROM my_index", response_format="raw") - assert res["result"] == "raw response" - - -def test_run_with_runtime_document_store(): - mock_store1 = Mock(spec=OpenSearchDocumentStore) - mock_store2 = Mock(spec=OpenSearchDocumentStore) - mock_store2._query_sql.return_value = [{"result": "from store 2"}] - retriever = OpenSearchSQLRetriever(document_store=mock_store1) - res = retriever.run(query="SELECT * FROM my_index", document_store=mock_store2) - mock_store1._query_sql.assert_not_called() - mock_store2._query_sql.assert_called_once_with(query="SELECT * FROM my_index", response_format="json") - assert res["result"] == [{"result": "from store 2"}] - - -def test_run_with_error_raise_on_failure(): - mock_store = Mock(spec=OpenSearchDocumentStore) - mock_store._query_sql.side_effect = Exception("SQL error") - retriever = OpenSearchSQLRetriever(document_store=mock_store, raise_on_failure=True) - with pytest.raises(Exception, match="SQL error"): - retriever.run(query="SELECT * FROM my_index") - - -def test_run_with_error_no_raise(): - mock_store = Mock(spec=OpenSearchDocumentStore) - mock_store._query_sql.side_effect = Exception("SQL error") - retriever = OpenSearchSQLRetriever(document_store=mock_store, raise_on_failure=False) - res = retriever.run(query="SELECT * FROM my_index") - assert res["result"] is None - - -@pytest.mark.integration -def test_sql_retriever_basic_query(document_store: OpenSearchDocumentStore): - """Test basic SQL query execution with JSON format""" - docs = [ - Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}), - Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}), - Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}), - Document(content="JavaScript development", meta={"category": "C", "status": "active", "priority": 1}), - ] - document_store.write_documents(docs, refresh=True) - - retriever = OpenSearchSQLRetriever(document_store=document_store) - sql_query = ( - f"SELECT content, category, status, priority FROM {document_store._index} " # noqa: S608 - f"WHERE category = 'A' ORDER BY priority" - ) - result = retriever.run(query=sql_query) - - assert "result" in result - assert len(result["result"]) == 2 - assert isinstance(result["result"], list) - assert all(isinstance(row, dict) for row in result["result"]) - - categories = [row.get("category") for row in result["result"]] - assert all(cat == "A" for cat in categories) - - for row in result["result"]: - assert "content" in row - assert "category" in row - assert "status" in row - assert "priority" in row - - -@pytest.mark.integration -def test_sql_retriever_csv_format(document_store: OpenSearchDocumentStore): - """Test SQL query with CSV response format""" - docs = [ - Document(content="Python programming", meta={"category": "A", "status": "active"}), - Document(content="Java programming", meta={"category": "B", "status": "active"}), - ] - document_store.write_documents(docs, refresh=True) - - retriever = OpenSearchSQLRetriever(document_store=document_store, response_format="csv") - sql_query = f"SELECT content, category FROM {document_store._index}" # noqa: S608 - result = retriever.run(query=sql_query) - - assert "result" in result - assert isinstance(result["result"], str) - assert "content" in result["result"] - assert "category" in result["result"] - - -@pytest.mark.integration -def test_sql_retriever_count_query(document_store: OpenSearchDocumentStore): - """Test COUNT query execution""" - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - Document(content="Doc 3", meta={"category": "A"}), - ] - document_store.write_documents(docs, refresh=True) - - retriever = OpenSearchSQLRetriever(document_store=document_store) - count_query = f"SELECT COUNT(*) as total FROM {document_store._index}" # noqa: S608 - result = retriever.run(query=count_query) - - assert "result" in result - assert result["result"] is not None - - -@pytest.mark.integration -def test_sql_retriever_with_filters(document_store: OpenSearchDocumentStore): - """Test SQL query with WHERE clause filtering""" - - docs = [ - Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}), - Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}), - Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}), - ] - document_store.write_documents(docs, refresh=True) - - retriever = OpenSearchSQLRetriever(document_store=document_store) - sql_query = ( - f"SELECT content, category, status FROM {document_store._index} " # noqa: S608 - f"WHERE category = 'A' AND status = 'active'" - ) - result = retriever.run(query=sql_query) - - assert "result" in result - assert len(result["result"]) == 1 - assert result["result"][0]["category"] == "A" - assert result["result"][0]["status"] == "active" - - -@pytest.mark.integration -def test_sql_retriever_runtime_response_format(document_store: OpenSearchDocumentStore): - """Test overriding response format at runtime""" - docs = [ - Document(content="Python programming", meta={"category": "A"}), - Document(content="Java programming", meta={"category": "B"}), - ] - document_store.write_documents(docs, refresh=True) - - retriever = OpenSearchSQLRetriever(document_store=document_store, response_format="json") - sql_query = f"SELECT content, category FROM {document_store._index}" # noqa: S608 - - # Override with CSV format at runtime - result = retriever.run(query=sql_query, response_format="csv") - assert isinstance(result["result"], str) - assert "content" in result["result"] - - # Use default JSON format - result_json = retriever.run(query=sql_query) - assert isinstance(result_json["result"], list) - - -@pytest.mark.integration -def test_sql_retriever_runtime_document_store_switching( - document_store: OpenSearchDocumentStore, document_store_2: OpenSearchDocumentStore -): - """Test switching document stores at runtime""" - docs1 = [ - Document(content="Python programming", meta={"category": "A"}), - Document(content="Java programming", meta={"category": "B"}), - ] - document_store.write_documents(docs1, refresh=True) - - docs2 = [ - Document(content="JavaScript development", meta={"category": "C"}), - Document(content="TypeScript development", meta={"category": "D"}), - ] - document_store_2.write_documents(docs2, refresh=True) - - retriever = OpenSearchSQLRetriever(document_store=document_store) - - # Query first store - sql_query1 = f"SELECT content, category FROM {document_store._index} WHERE category = 'A'" # noqa: S608 - result1 = retriever.run(query=sql_query1) - assert len(result1["result"]) == 1 - assert "Python" in result1["result"][0]["content"] - - # Query second store at runtime - sql_query2 = f"SELECT content, category FROM {document_store_2._index} WHERE category = 'C'" # noqa: S608 - result2 = retriever.run(query=sql_query2, document_store=document_store_2) - assert len(result2["result"]) == 1 - assert "JavaScript" in result2["result"][0]["content"] - - # Verify results are different - assert result1["result"][0]["content"] != result2["result"][0]["content"] - - -@pytest.mark.integration -def test_sql_retriever_error_handling(document_store: OpenSearchDocumentStore): - """Test error handling for invalid SQL queries""" - retriever = OpenSearchSQLRetriever(document_store=document_store, raise_on_failure=True) - - invalid_query = "SELECT * FROM non_existent_index" - with pytest.raises(Exception, match="Failed to execute SQL query"): - retriever.run(query=invalid_query) - - # Test with raise_on_failure=False - retriever_no_raise = OpenSearchSQLRetriever(document_store=document_store, raise_on_failure=False) - result = retriever_no_raise.run(query=invalid_query) - assert result["result"] is None - - -@pytest.mark.asyncio -async def test_run_async(): - mock_store = Mock(spec=OpenSearchDocumentStore) - mock_store._query_sql_async.return_value = [{"content": "Test doc", "category": "A"}] - retriever = OpenSearchSQLRetriever(document_store=mock_store) - res = await retriever.run_async(query="SELECT content, category FROM my_index WHERE category = 'A'") - mock_store._query_sql_async.assert_called_once_with( - query="SELECT content, category FROM my_index WHERE category = 'A'", - response_format="json", - ) - assert len(res) == 1 - assert "result" in res - assert res["result"] == [{"content": "Test doc", "category": "A"}] - - -@pytest.mark.asyncio -async def test_run_async_with_error_raise_on_failure(): - mock_store = Mock(spec=OpenSearchDocumentStore) - mock_store._query_sql_async.side_effect = Exception("SQL error") - retriever = OpenSearchSQLRetriever(document_store=mock_store, raise_on_failure=True) - with pytest.raises(Exception, match="SQL error"): - await retriever.run_async(query="SELECT * FROM my_index") - - -@pytest.mark.asyncio -async def test_run_async_with_error_no_raise(): - mock_store = Mock(spec=OpenSearchDocumentStore) - mock_store._query_sql_async.side_effect = Exception("SQL error") - retriever = OpenSearchSQLRetriever(document_store=mock_store, raise_on_failure=False) - res = await retriever.run_async(query="SELECT * FROM my_index") - assert res["result"] is None - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_sql_retriever_async_basic_query(document_store: OpenSearchDocumentStore): - """Test basic async SQL query execution""" - docs = [ - Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}), - Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}), - Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}), - ] - await document_store.write_documents_async(docs, refresh=True) - - retriever = OpenSearchSQLRetriever(document_store=document_store) - sql_query = ( - f"SELECT content, category, status FROM {document_store._index} " # noqa: S608 - f"WHERE category = 'A' ORDER BY priority" - ) - result = await retriever.run_async(query=sql_query) - - assert "result" in result - assert len(result["result"]) == 2 - assert isinstance(result["result"], list) - assert all(isinstance(row, dict) for row in result["result"]) - - categories = [row.get("category") for row in result["result"]] - assert all(cat == "A" for cat in categories) - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_sql_retriever_async_csv_format(document_store: OpenSearchDocumentStore): - """Test async SQL query with CSV response format""" - docs = [ - Document(content="Python programming", meta={"category": "A"}), - Document(content="Java programming", meta={"category": "B"}), - ] - await document_store.write_documents_async(docs, refresh=True) - - retriever = OpenSearchSQLRetriever(document_store=document_store, response_format="csv") - sql_query = f"SELECT content, category FROM {document_store._index}" # noqa: S608 - result = await retriever.run_async(query=sql_query) - - assert "result" in result - assert isinstance(result["result"], str) - assert "content" in result["result"] - assert "category" in result["result"] - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_sql_retriever_async_runtime_document_store_switching( - document_store: OpenSearchDocumentStore, document_store_2: OpenSearchDocumentStore -): - """Test async switching document stores at runtime""" - docs1 = [ - Document(content="Python programming", meta={"category": "A"}), - Document(content="Java programming", meta={"category": "B"}), - ] - await document_store.write_documents_async(docs1, refresh=True) - - docs2 = [ - Document(content="JavaScript development", meta={"category": "C"}), - Document(content="TypeScript development", meta={"category": "D"}), - ] - await document_store_2.write_documents_async(docs2, refresh=True) - - retriever = OpenSearchSQLRetriever(document_store=document_store) - - # Query first store - sql_query1 = f"SELECT content, category FROM {document_store._index} WHERE category = 'A'" # noqa: S608 - result1 = await retriever.run_async(query=sql_query1) - assert len(result1["result"]) == 1 - assert "Python" in result1["result"][0]["content"] - - # Query second store at runtime - sql_query2 = f"SELECT content, category FROM {document_store_2._index} WHERE category = 'C'" # noqa: S608 - result2 = await retriever.run_async(query=sql_query2, document_store=document_store_2) - assert len(result2["result"]) == 1 - assert "JavaScript" in result2["result"][0]["content"] - - # Verify results are different - assert result1["result"][0]["content"] != result2["result"][0]["content"] - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_sql_retriever_async_error_handling(document_store: OpenSearchDocumentStore): - """Test async error handling for invalid SQL queries""" - retriever = OpenSearchSQLRetriever(document_store=document_store, raise_on_failure=True) - - invalid_query = "SELECT * FROM non_existent_index" - with pytest.raises(Exception, match="Failed to execute SQL query"): - await retriever.run_async(query=invalid_query) - - # Test with raise_on_failure=False - retriever_no_raise = OpenSearchSQLRetriever(document_store=document_store, raise_on_failure=False) - result = await retriever_no_raise.run_async(query=invalid_query) - assert result["result"] is None From a37108c0c1f1f3d3349ebb7f08eb3b57c65e9a51 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 14 Jan 2026 10:36:46 +0100 Subject: [PATCH 50/58] fixing after sync with main --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 9f58bf18a3..e7d79d5ceb 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1189,7 +1189,7 @@ def _build_cardinality_aggregations(index_mapping: dict[str, Any]) -> dict[str, return aggs @staticmethod - def _build_distinct_values_query_body(filters: Optional[dict[str, Any]], aggs: dict[str, Any]) -> dict[str, Any]: + def _build_distinct_values_query_body(filters: dict[str, Any] | None, aggs: dict[str, Any]) -> dict[str, Any]: """ Builds the query body for distinct values counting with filters and aggregations. """ From ad13c32e5e5e6858d2a512940b5a799c65ab8ec5 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 14 Jan 2026 10:46:39 +0100 Subject: [PATCH 51/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index e7d79d5ceb..61efc7029b 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1309,7 +1309,7 @@ def get_metadata_fields_info(self) -> dict[str, dict]: index_mapping = {k: v for k, v in index_mapping.items() if k not in SPECIAL_FIELDS} return index_mapping - async def get_metadata_fields_info_async(self) -> dict[str, dict]: + async def get_metadata_fields_info_async(self) -> dict[str, dict[str, str]]: """ Asynchronously returns the information about the fields in the index. From e9ccd9e5fd7527f54f4255b68c8b0290997af682 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 14 Jan 2026 10:46:50 +0100 Subject: [PATCH 52/58] Update integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py Co-authored-by: Sebastian Husch Lee <10526848+sjrl@users.noreply.github.com> --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 61efc7029b..b84e53ac69 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1280,7 +1280,7 @@ async def count_unique_metadata_by_filter_async(self, filters: dict[str, Any]) - # extract cardinality values from aggregations return self._extract_distinct_counts_from_aggregations(result.get("aggregations", {}), index_mapping) - def get_metadata_fields_info(self) -> dict[str, dict]: + def get_metadata_fields_info(self) -> dict[str, dict[str, str]]: """ Returns the information about the fields in the index. From e5b90f2e481a07936d86b0aeb96f5956c7bdcd3b Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 14 Jan 2026 10:43:32 +0100 Subject: [PATCH 53/58] removing SQLResponse/Retriever related code --- .../opensearch/document_store.py | 35 +------------------ 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index b84e53ac69..6370f8a3ab 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1527,37 +1527,4 @@ async def get_metadata_field_unique_values_async( # Extract total count from cardinality aggregation total_count = int(aggregations.get("total_count", {}).get("value", 0)) - return unique_values, total_count - - def _prepare_sql_http_request_params( - self, base_url: str, response_format: ResponseFormat - ) -> tuple[str, dict[str, str], Any]: - """ - Prepares HTTP request parameters for SQL query execution. - """ - url = f"{base_url}/_plugins/_sql?format={response_format}" - headers = {"Content-Type": "application/json"} - auth = None - if self._http_auth: - if isinstance(self._http_auth, tuple): - auth = self._http_auth - elif isinstance(self._http_auth, AWSAuth): - # For AWS auth, we need to use the opensearchpy client - # Fall through to the try/except below - pass - return url, headers, auth - - @staticmethod - def _process_sql_response(response_data: Any, response_format: ResponseFormat) -> Any: - """ - Processes the SQL query response data. - """ - if response_format == "json": - # extract only the query results - if isinstance(response_data, dict) and "hits" in response_data: - hits = response_data.get("hits", {}).get("hits", []) - # extract _source from each hit, which contains the actual document data - return [hit.get("_source", {}) for hit in hits] - return response_data - else: - return response_data if isinstance(response_data, str) else str(response_data) + return unique_values, total_count \ No newline at end of file From 87eaeebf8ced67478d4061ace9a1659bf45ea791 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 14 Jan 2026 10:47:29 +0100 Subject: [PATCH 54/58] new line at end of file --- .../document_stores/opensearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 6370f8a3ab..de6370281c 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1527,4 +1527,4 @@ async def get_metadata_field_unique_values_async( # Extract total count from cardinality aggregation total_count = int(aggregations.get("total_count", {}).get("value", 0)) - return unique_values, total_count \ No newline at end of file + return unique_values, total_count From 82f0fc24e1b6610b05c575a701403250073f3b4e Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 15 Jan 2026 13:12:22 +0100 Subject: [PATCH 55/58] updating return value on get_metadata_field_unique + count_unique_metadata_by_filter --- .../opensearch/document_store.py | 86 +++++++++++++------ .../opensearch/tests/test_document_store.py | 65 ++++++++------ .../tests/test_document_store_async.py | 69 ++++++++------- 3 files changed, 132 insertions(+), 88 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index de6370281c..ee1e3d8cc4 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1176,15 +1176,19 @@ async def count_documents_by_filter_async(self, filters: dict[str, Any]) -> int: return (await self._async_client.count(index=self._index, body=body))["count"] @staticmethod - def _build_cardinality_aggregations(index_mapping: dict[str, Any]) -> dict[str, Any]: + def _build_cardinality_aggregations(index_mapping: dict[str, Any], fields: list[str]) -> dict[str, Any]: """ - Builds cardinality aggregations for all metadata fields in the index mapping. + Builds cardinality aggregations for specified metadata fields in the index mapping. + + :param index_mapping: The index mapping containing field definitions. + :param fields: List of field names to build aggregations for. + :returns: Dictionary of cardinality aggregations. See: https://docs.opensearch.org/latest/aggregations/metric/cardinality/ """ aggs = {} - for field_name in index_mapping.keys(): - if field_name not in SPECIAL_FIELDS: + for field_name in fields: + if field_name not in SPECIAL_FIELDS and field_name in index_mapping: aggs[f"{field_name}_cardinality"] = {"cardinality": {"field": field_name}} return aggs @@ -1210,27 +1214,36 @@ def _build_distinct_values_query_body(filters: dict[str, Any] | None, aggs: dict @staticmethod def _extract_distinct_counts_from_aggregations( - aggregations: dict[str, Any], index_mapping: dict[str, Any] + aggregations: dict[str, Any], index_mapping: dict[str, Any], fields: list[str] ) -> dict[str, int]: """ Extracts distinct value counts from search result aggregations. + + :param aggregations: The aggregations result from the search query. + :param index_mapping: The index mapping containing field definitions. + :param fields: List of field names to extract counts for. + :returns: Dictionary mapping field names to their distinct value counts. """ distinct_counts = {} - for field_name in index_mapping.keys(): - if field_name not in SPECIAL_FIELDS: + for field_name in fields: + if field_name not in SPECIAL_FIELDS and field_name in index_mapping: agg_key = f"{field_name}_cardinality" if agg_key in aggregations: distinct_counts[field_name] = aggregations[agg_key]["value"] return distinct_counts - def count_unique_metadata_by_filter(self, filters: dict[str, Any]) -> dict[str, int]: + def count_unique_metadata_by_filter(self, filters: dict[str, Any], fields: list[str]) -> dict[str, int]: """ - Returns the number of unique values for each metadata field of the documents that match the provided filters. + Returns the number of unique values for each specified metadata field of the documents + that match the provided filters. :param filters: The filters to apply to count documents. For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) + :param fields: List of field names to calculate unique values for. + Field names can include or omit the "meta." prefix. :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered documents. + :raises ValueError: If any of the requested fields don't exist in the index mapping. """ self._ensure_initialized() assert self._client is not None @@ -1239,8 +1252,16 @@ def count_unique_metadata_by_filter(self, filters: dict[str, Any]) -> dict[str, mapping = self._client.indices.get_mapping(index=self._index) index_mapping = mapping[self._index]["mappings"]["properties"] - # build aggregations for each metadata field - aggs = self._build_cardinality_aggregations(index_mapping) + # normalize field names + normalized_fields = [self._normalize_metadata_field_name(field) for field in fields] + # validate that all requested fields exist in the index mapping + missing_fields = [f for f in normalized_fields if f not in index_mapping] + if missing_fields: + msg = f"Fields not found in index mapping: {missing_fields}" + raise ValueError(msg) + + # build aggregations for specified metadata fields + aggs = self._build_cardinality_aggregations(index_mapping, normalized_fields) if not aggs: return {} @@ -1249,17 +1270,22 @@ def count_unique_metadata_by_filter(self, filters: dict[str, Any]) -> dict[str, result = self._client.search(index=self._index, body=body) # extract cardinality values from aggregations - return self._extract_distinct_counts_from_aggregations(result.get("aggregations", {}), index_mapping) + return self._extract_distinct_counts_from_aggregations( + result.get("aggregations", {}), index_mapping, normalized_fields + ) - async def count_unique_metadata_by_filter_async(self, filters: dict[str, Any]) -> dict[str, int]: + async def count_unique_metadata_by_filter_async(self, filters: dict[str, Any], fields: list[str]) -> dict[str, int]: """ - Asynchronously returns the number of unique values for each metadata field of the documents that match the - provided filters. + Asynchronously returns the number of unique values for each specified metadata field of the documents + that match the provided filters. :param filters: The filters to apply to count documents. For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) + :param fields: List of field names to calculate unique values for. + Field names can include or omit the "meta." prefix. :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered documents. + :raises ValueError: If any of the requested fields don't exist in the index mapping. """ await self._ensure_initialized_async() assert self._async_client is not None @@ -1268,8 +1294,16 @@ async def count_unique_metadata_by_filter_async(self, filters: dict[str, Any]) - mapping = await self._async_client.indices.get_mapping(index=self._index) index_mapping = mapping[self._index]["mappings"]["properties"] - # build aggregations for each metadata field - aggs = self._build_cardinality_aggregations(index_mapping) + # normalize field names + normalized_fields = [self._normalize_metadata_field_name(field) for field in fields] + # validate that all requested fields exist in the index mapping + missing_fields = [f for f in normalized_fields if f not in index_mapping] + if missing_fields: + msg = f"Fields not found in index mapping: {missing_fields}" + raise ValueError(msg) + + # build aggregations for specified metadata fields + aggs = self._build_cardinality_aggregations(index_mapping, normalized_fields) if not aggs: return {} @@ -1278,7 +1312,9 @@ async def count_unique_metadata_by_filter_async(self, filters: dict[str, Any]) - result = await self._async_client.search(index=self._index, body=body) # extract cardinality values from aggregations - return self._extract_distinct_counts_from_aggregations(result.get("aggregations", {}), index_mapping) + return self._extract_distinct_counts_from_aggregations( + result.get("aggregations", {}), index_mapping, normalized_fields + ) def get_metadata_fields_info(self) -> dict[str, dict[str, str]]: """ @@ -1409,7 +1445,7 @@ async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[st def get_metadata_field_unique_values( self, metadata_field: str, search_term: str | None, from_: int, size: int - ) -> tuple[list[str], int]: + ) -> list[str]: """ Returns unique values for a metadata field, optionally filtered by a search term in the content. @@ -1463,14 +1499,11 @@ def get_metadata_field_unique_values( paginated_buckets = unique_values_buckets[from_ : from_ + size] unique_values = [str(bucket["key"]) for bucket in paginated_buckets] - # Extract total count from cardinality aggregation - total_count = int(aggregations.get("total_count", {}).get("value", 0)) - - return unique_values, total_count + return unique_values async def get_metadata_field_unique_values_async( self, metadata_field: str, search_term: str | None, from_: int, size: int - ) -> tuple[list[str], int]: + ) -> list[str]: """ Asynchronously returns unique values for a metadata field, optionally filtered by a search term in the content. @@ -1524,7 +1557,4 @@ async def get_metadata_field_unique_values_async( paginated_buckets = unique_values_buckets[from_ : from_ + size] unique_values = [str(bucket["key"]) for bucket in paginated_buckets] - # Extract total count from cardinality aggregation - total_count = int(aggregations.get("total_count", {}).get("value", 0)) - - return unique_values, total_count + return unique_values diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index e02d7a1aaf..85dedf98fa 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -631,14 +631,17 @@ def test_count_unique_metadata_by_filter(self, document_store: OpenSearchDocumen assert document_store.count_documents() == 5 # Count distinct values for all documents - distinct_counts = document_store.count_unique_metadata_by_filter(filters={}) + distinct_counts = document_store.count_unique_metadata_by_filter( + filters={}, fields=["category", "status", "priority"] + ) assert distinct_counts["category"] == 3 # A, B, C assert distinct_counts["status"] == 2 # active, inactive assert distinct_counts["priority"] == 3 # 1, 2, 3 # Count distinct values for documents with category="A" distinct_counts_a = document_store.count_unique_metadata_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "A"} + filters={"field": "meta.category", "operator": "==", "value": "A"}, + fields=["category", "status", "priority"], ) assert distinct_counts_a["category"] == 1 # Only A assert distinct_counts_a["status"] == 2 # active, inactive @@ -646,7 +649,8 @@ def test_count_unique_metadata_by_filter(self, document_store: OpenSearchDocumen # Count distinct values for documents with status="active" distinct_counts_active = document_store.count_unique_metadata_by_filter( - filters={"field": "meta.status", "operator": "==", "value": "active"} + filters={"field": "meta.status", "operator": "==", "value": "active"}, + fields=["category", "status", "priority"], ) assert distinct_counts_active["category"] == 3 # A, B, C assert distinct_counts_active["status"] == 1 # Only active @@ -660,12 +664,33 @@ def test_count_unique_metadata_by_filter(self, document_store: OpenSearchDocumen {"field": "meta.category", "operator": "==", "value": "A"}, {"field": "meta.status", "operator": "==", "value": "active"}, ], - } + }, + fields=["category", "status", "priority"], ) assert distinct_counts_a_active["category"] == 1 # Only A assert distinct_counts_a_active["status"] == 1 # Only active assert distinct_counts_a_active["priority"] == 2 # 1, 3 + # Test with only a subset of fields + distinct_counts_subset = document_store.count_unique_metadata_by_filter( + filters={}, fields=["category", "status"] + ) + assert distinct_counts_subset["category"] == 3 + assert distinct_counts_subset["status"] == 2 + assert "priority" not in distinct_counts_subset + + # Test field name normalization (with "meta." prefix) + distinct_counts_normalized = document_store.count_unique_metadata_by_filter( + filters={}, fields=["meta.category", "status", "meta.priority"] + ) + assert distinct_counts_normalized["category"] == 3 + assert distinct_counts_normalized["status"] == 2 + assert distinct_counts_normalized["priority"] == 3 + + # Test error handling when field doesn't exist + with pytest.raises(ValueError, match="Fields not found in index mapping"): + document_store.count_unique_metadata_by_filter(filters={}, fields=["nonexistent_field"]) + def test_get_metadata_fields_info(self, document_store: OpenSearchDocumentStore): docs = [ Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), @@ -733,40 +758,30 @@ def test_get_metadata_field_unique_values(self, document_store: OpenSearchDocume document_store.write_documents(docs) # Test getting all unique values without search term - unique_values, total_count = document_store.get_metadata_field_unique_values("meta.category", None, 0, 10) + unique_values = document_store.get_metadata_field_unique_values("meta.category", None, 0, 10) assert set(unique_values) == {"A", "B", "C"} - assert total_count == 3 # Test with "meta." prefix - unique_languages, lang_count = document_store.get_metadata_field_unique_values("meta.language", None, 0, 10) + unique_languages = document_store.get_metadata_field_unique_values("meta.language", None, 0, 10) assert set(unique_languages) == {"Python", "Java", "JavaScript"} - assert lang_count == 3 # Test pagination - first page - unique_values_page1, total_count = document_store.get_metadata_field_unique_values("meta.category", None, 0, 2) + unique_values_page1 = document_store.get_metadata_field_unique_values("meta.category", None, 0, 2) assert len(unique_values_page1) == 2 - assert total_count == 3 assert all(val in ["A", "B", "C"] for val in unique_values_page1) # Test pagination - second page - unique_values_page2, total_count = document_store.get_metadata_field_unique_values("meta.category", None, 2, 2) + unique_values_page2 = document_store.get_metadata_field_unique_values("meta.category", None, 2, 2) assert len(unique_values_page2) == 1 - assert total_count == 3 assert unique_values_page2[0] in ["A", "B", "C"] # Test with search term - filter by content matching "Python" - unique_values_filtered, total_count = document_store.get_metadata_field_unique_values( - "meta.category", "Python", 0, 10 - ) + unique_values_filtered = document_store.get_metadata_field_unique_values("meta.category", "Python", 0, 10) assert set(unique_values_filtered) == {"A"} # Only category A has documents with "Python" in content - assert total_count == 1 # Test with search term - filter by content matching "Java" - unique_values_java, total_count = document_store.get_metadata_field_unique_values( - "meta.category", "Java", 0, 10 - ) + unique_values_java = document_store.get_metadata_field_unique_values("meta.category", "Java", 0, 10) assert set(unique_values_java) == {"B"} # Only category B has documents with "Java" in content - assert total_count == 1 # Test with integer values int_docs = [ @@ -776,18 +791,12 @@ def test_get_metadata_field_unique_values(self, document_store: OpenSearchDocume Document(content="Doc 4", meta={"priority": 3}), ] document_store.write_documents(int_docs) - unique_priorities, priority_count = document_store.get_metadata_field_unique_values( - "meta.priority", None, 0, 10 - ) + unique_priorities = document_store.get_metadata_field_unique_values("meta.priority", None, 0, 10) assert set(unique_priorities) == {"1", "2", "3"} - assert priority_count == 3 # Test with search term on integer field - unique_priorities_filtered, priority_count = document_store.get_metadata_field_unique_values( - "meta.priority", "Doc 1", 0, 10 - ) + unique_priorities_filtered = document_store.get_metadata_field_unique_values("meta.priority", "Doc 1", 0, 10) assert set(unique_priorities_filtered) == {"1"} - assert priority_count == 1 @pytest.mark.integration def test_write_with_routing(self, document_store: OpenSearchDocumentStore): diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index a1b5271400..6d438c86ba 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -281,14 +281,17 @@ async def test_count_unique_metadata_by_filter(self, document_store: OpenSearchD assert await document_store.count_documents_async() == 5 # count distinct values for all documents - distinct_counts = await document_store.count_unique_metadata_by_filter_async(filters={}) + distinct_counts = await document_store.count_unique_metadata_by_filter_async( + filters={}, fields=["category", "status", "priority"] + ) assert distinct_counts["category"] == 3 # A, B, C assert distinct_counts["status"] == 2 # active, inactive assert distinct_counts["priority"] == 3 # 1, 2, 3 # count distinct values for documents with category="A" distinct_counts_a = await document_store.count_unique_metadata_by_filter_async( - filters={"field": "meta.category", "operator": "==", "value": "A"} + filters={"field": "meta.category", "operator": "==", "value": "A"}, + fields=["category", "status", "priority"], ) assert distinct_counts_a["category"] == 1 # Only A assert distinct_counts_a["status"] == 2 # active, inactive @@ -296,7 +299,8 @@ async def test_count_unique_metadata_by_filter(self, document_store: OpenSearchD # count distinct values for documents with status="active" distinct_counts_active = await document_store.count_unique_metadata_by_filter_async( - filters={"field": "meta.status", "operator": "==", "value": "active"} + filters={"field": "meta.status", "operator": "==", "value": "active"}, + fields=["category", "status", "priority"], ) assert distinct_counts_active["category"] == 3 # A, B, C assert distinct_counts_active["status"] == 1 # Only active @@ -310,12 +314,33 @@ async def test_count_unique_metadata_by_filter(self, document_store: OpenSearchD {"field": "meta.category", "operator": "==", "value": "A"}, {"field": "meta.status", "operator": "==", "value": "active"}, ], - } + }, + fields=["category", "status", "priority"], ) assert distinct_counts_a_active["category"] == 1 # Only A assert distinct_counts_a_active["status"] == 1 # Only active assert distinct_counts_a_active["priority"] == 2 # 1, 3 + # Test with only a subset of fields + distinct_counts_subset = await document_store.count_unique_metadata_by_filter_async( + filters={}, fields=["category", "status"] + ) + assert distinct_counts_subset["category"] == 3 + assert distinct_counts_subset["status"] == 2 + assert "priority" not in distinct_counts_subset + + # Test field name normalization (with "meta." prefix) + distinct_counts_normalized = await document_store.count_unique_metadata_by_filter_async( + filters={}, fields=["meta.category", "status", "meta.priority"] + ) + assert distinct_counts_normalized["category"] == 3 + assert distinct_counts_normalized["status"] == 2 + assert distinct_counts_normalized["priority"] == 3 + + # Test error handling when field doesn't exist + with pytest.raises(ValueError, match="Fields not found in index mapping"): + await document_store.count_unique_metadata_by_filter_async(filters={}, fields=["nonexistent_field"]) + @pytest.mark.asyncio async def test_delete_documents(self, document_store: OpenSearchDocumentStore): doc = Document(content="test doc") @@ -542,48 +567,32 @@ async def test_get_metadata_field_unique_values_async(self, document_store: Open await document_store.write_documents_async(docs) # Test getting all unique values without search term - unique_values, total_count = await document_store.get_metadata_field_unique_values_async( - "meta.category", None, 0, 10 - ) + unique_values = await document_store.get_metadata_field_unique_values_async("meta.category", None, 0, 10) assert set(unique_values) == {"A", "B", "C"} - assert total_count == 3 # Test with "meta." prefix - unique_languages, lang_count = await document_store.get_metadata_field_unique_values_async( - "meta.language", None, 0, 10 - ) + unique_languages = await document_store.get_metadata_field_unique_values_async("meta.language", None, 0, 10) assert set(unique_languages) == {"Python", "Java", "JavaScript"} - assert lang_count == 3 # Test pagination - first page - unique_values_page1, total_count = await document_store.get_metadata_field_unique_values_async( - "meta.category", None, 0, 2 - ) + unique_values_page1 = await document_store.get_metadata_field_unique_values_async("meta.category", None, 0, 2) assert len(unique_values_page1) == 2 - assert total_count == 3 assert all(val in ["A", "B", "C"] for val in unique_values_page1) # Test pagination - second page - unique_values_page2, total_count = await document_store.get_metadata_field_unique_values_async( - "meta.category", None, 2, 2 - ) + unique_values_page2 = await document_store.get_metadata_field_unique_values_async("meta.category", None, 2, 2) assert len(unique_values_page2) == 1 - assert total_count == 3 assert unique_values_page2[0] in ["A", "B", "C"] # Test with search term - filter by content matching "Python" - unique_values_filtered, total_count = await document_store.get_metadata_field_unique_values_async( + unique_values_filtered = await document_store.get_metadata_field_unique_values_async( "meta.category", "Python", 0, 10 ) assert set(unique_values_filtered) == {"A"} # Only category A has documents with "Python" in content - assert total_count == 1 # Test with search term - filter by content matching "Java" - unique_values_java, total_count = await document_store.get_metadata_field_unique_values_async( - "meta.category", "Java", 0, 10 - ) + unique_values_java = await document_store.get_metadata_field_unique_values_async("meta.category", "Java", 0, 10) assert set(unique_values_java) == {"B"} # Only category B has documents with "Java" in content - assert total_count == 1 # Test with integer values int_docs = [ @@ -593,15 +602,11 @@ async def test_get_metadata_field_unique_values_async(self, document_store: Open Document(content="Doc 4", meta={"priority": 3}), ] await document_store.write_documents_async(int_docs) - unique_priorities, priority_count = await document_store.get_metadata_field_unique_values_async( - "meta.priority", None, 0, 10 - ) + unique_priorities = await document_store.get_metadata_field_unique_values_async("meta.priority", None, 0, 10) assert set(unique_priorities) == {"1", "2", "3"} - assert priority_count == 3 # Test with search term on integer field - unique_priorities_filtered, priority_count = await document_store.get_metadata_field_unique_values_async( + unique_priorities_filtered = await document_store.get_metadata_field_unique_values_async( "meta.priority", "Doc 1", 0, 10 ) assert set(unique_priorities_filtered) == {"1"} - assert priority_count == 1 From 1f1dc00791bea568e0291a720aaf31db43a2ba0b Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 15 Jan 2026 14:37:45 +0100 Subject: [PATCH 56/58] updating params name --- .../opensearch/document_store.py | 26 ++++++++++--------- .../opensearch/tests/test_document_store.py | 14 +++++----- .../tests/test_document_store_async.py | 16 +++++++----- 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index ee1e3d8cc4..71a19ffe3a 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1232,14 +1232,14 @@ def _extract_distinct_counts_from_aggregations( distinct_counts[field_name] = aggregations[agg_key]["value"] return distinct_counts - def count_unique_metadata_by_filter(self, filters: dict[str, Any], fields: list[str]) -> dict[str, int]: + def count_unique_metadata_by_filter(self, filters: dict[str, Any], metadata_fields: list[str]) -> dict[str, int]: """ Returns the number of unique values for each specified metadata field of the documents that match the provided filters. :param filters: The filters to apply to count documents. For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) - :param fields: List of field names to calculate unique values for. + :param metadata_fields: List of field names to calculate unique values for. Field names can include or omit the "meta." prefix. :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered documents. @@ -1253,15 +1253,15 @@ def count_unique_metadata_by_filter(self, filters: dict[str, Any], fields: list[ index_mapping = mapping[self._index]["mappings"]["properties"] # normalize field names - normalized_fields = [self._normalize_metadata_field_name(field) for field in fields] + normalized_metadata_fields = [self._normalize_metadata_field_name(field) for field in metadata_fields] # validate that all requested fields exist in the index mapping - missing_fields = [f for f in normalized_fields if f not in index_mapping] + missing_fields = [f for f in normalized_metadata_fields if f not in index_mapping] if missing_fields: msg = f"Fields not found in index mapping: {missing_fields}" raise ValueError(msg) # build aggregations for specified metadata fields - aggs = self._build_cardinality_aggregations(index_mapping, normalized_fields) + aggs = self._build_cardinality_aggregations(index_mapping, normalized_metadata_fields) if not aggs: return {} @@ -1271,17 +1271,19 @@ def count_unique_metadata_by_filter(self, filters: dict[str, Any], fields: list[ # extract cardinality values from aggregations return self._extract_distinct_counts_from_aggregations( - result.get("aggregations", {}), index_mapping, normalized_fields + result.get("aggregations", {}), index_mapping, normalized_metadata_fields ) - async def count_unique_metadata_by_filter_async(self, filters: dict[str, Any], fields: list[str]) -> dict[str, int]: + async def count_unique_metadata_by_filter_async( + self, filters: dict[str, Any], metadata_fields: list[str] + ) -> dict[str, int]: """ Asynchronously returns the number of unique values for each specified metadata field of the documents that match the provided filters. :param filters: The filters to apply to count documents. For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering) - :param fields: List of field names to calculate unique values for. + :param metadata_fields: List of field names to calculate unique values for. Field names can include or omit the "meta." prefix. :returns: A dictionary mapping each metadata field name to the count of its unique values among the filtered documents. @@ -1295,15 +1297,15 @@ async def count_unique_metadata_by_filter_async(self, filters: dict[str, Any], f index_mapping = mapping[self._index]["mappings"]["properties"] # normalize field names - normalized_fields = [self._normalize_metadata_field_name(field) for field in fields] + normalized_metadata_fields = [self._normalize_metadata_field_name(field) for field in metadata_fields] # validate that all requested fields exist in the index mapping - missing_fields = [f for f in normalized_fields if f not in index_mapping] + missing_fields = [f for f in normalized_metadata_fields if f not in index_mapping] if missing_fields: msg = f"Fields not found in index mapping: {missing_fields}" raise ValueError(msg) # build aggregations for specified metadata fields - aggs = self._build_cardinality_aggregations(index_mapping, normalized_fields) + aggs = self._build_cardinality_aggregations(index_mapping, normalized_metadata_fields) if not aggs: return {} @@ -1313,7 +1315,7 @@ async def count_unique_metadata_by_filter_async(self, filters: dict[str, Any], f # extract cardinality values from aggregations return self._extract_distinct_counts_from_aggregations( - result.get("aggregations", {}), index_mapping, normalized_fields + result.get("aggregations", {}), index_mapping, normalized_metadata_fields ) def get_metadata_fields_info(self) -> dict[str, dict[str, str]]: diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 85dedf98fa..bad19c4dbe 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -632,7 +632,7 @@ def test_count_unique_metadata_by_filter(self, document_store: OpenSearchDocumen # Count distinct values for all documents distinct_counts = document_store.count_unique_metadata_by_filter( - filters={}, fields=["category", "status", "priority"] + filters={}, metadata_fields=["category", "status", "priority"] ) assert distinct_counts["category"] == 3 # A, B, C assert distinct_counts["status"] == 2 # active, inactive @@ -641,7 +641,7 @@ def test_count_unique_metadata_by_filter(self, document_store: OpenSearchDocumen # Count distinct values for documents with category="A" distinct_counts_a = document_store.count_unique_metadata_by_filter( filters={"field": "meta.category", "operator": "==", "value": "A"}, - fields=["category", "status", "priority"], + metadata_fields=["category", "status", "priority"], ) assert distinct_counts_a["category"] == 1 # Only A assert distinct_counts_a["status"] == 2 # active, inactive @@ -650,7 +650,7 @@ def test_count_unique_metadata_by_filter(self, document_store: OpenSearchDocumen # Count distinct values for documents with status="active" distinct_counts_active = document_store.count_unique_metadata_by_filter( filters={"field": "meta.status", "operator": "==", "value": "active"}, - fields=["category", "status", "priority"], + metadata_fields=["category", "status", "priority"], ) assert distinct_counts_active["category"] == 3 # A, B, C assert distinct_counts_active["status"] == 1 # Only active @@ -665,7 +665,7 @@ def test_count_unique_metadata_by_filter(self, document_store: OpenSearchDocumen {"field": "meta.status", "operator": "==", "value": "active"}, ], }, - fields=["category", "status", "priority"], + metadata_fields=["category", "status", "priority"], ) assert distinct_counts_a_active["category"] == 1 # Only A assert distinct_counts_a_active["status"] == 1 # Only active @@ -673,7 +673,7 @@ def test_count_unique_metadata_by_filter(self, document_store: OpenSearchDocumen # Test with only a subset of fields distinct_counts_subset = document_store.count_unique_metadata_by_filter( - filters={}, fields=["category", "status"] + filters={}, metadata_fields=["category", "status"] ) assert distinct_counts_subset["category"] == 3 assert distinct_counts_subset["status"] == 2 @@ -681,7 +681,7 @@ def test_count_unique_metadata_by_filter(self, document_store: OpenSearchDocumen # Test field name normalization (with "meta." prefix) distinct_counts_normalized = document_store.count_unique_metadata_by_filter( - filters={}, fields=["meta.category", "status", "meta.priority"] + filters={}, metadata_fields=["meta.category", "status", "meta.priority"] ) assert distinct_counts_normalized["category"] == 3 assert distinct_counts_normalized["status"] == 2 @@ -689,7 +689,7 @@ def test_count_unique_metadata_by_filter(self, document_store: OpenSearchDocumen # Test error handling when field doesn't exist with pytest.raises(ValueError, match="Fields not found in index mapping"): - document_store.count_unique_metadata_by_filter(filters={}, fields=["nonexistent_field"]) + document_store.count_unique_metadata_by_filter(filters={}, metadata_fields=["nonexistent_field"]) def test_get_metadata_fields_info(self, document_store: OpenSearchDocumentStore): docs = [ diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index 6d438c86ba..b33ee2677e 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -282,7 +282,7 @@ async def test_count_unique_metadata_by_filter(self, document_store: OpenSearchD # count distinct values for all documents distinct_counts = await document_store.count_unique_metadata_by_filter_async( - filters={}, fields=["category", "status", "priority"] + filters={}, metadata_fields=["category", "status", "priority"] ) assert distinct_counts["category"] == 3 # A, B, C assert distinct_counts["status"] == 2 # active, inactive @@ -291,7 +291,7 @@ async def test_count_unique_metadata_by_filter(self, document_store: OpenSearchD # count distinct values for documents with category="A" distinct_counts_a = await document_store.count_unique_metadata_by_filter_async( filters={"field": "meta.category", "operator": "==", "value": "A"}, - fields=["category", "status", "priority"], + metadata_fields=["category", "status", "priority"], ) assert distinct_counts_a["category"] == 1 # Only A assert distinct_counts_a["status"] == 2 # active, inactive @@ -300,7 +300,7 @@ async def test_count_unique_metadata_by_filter(self, document_store: OpenSearchD # count distinct values for documents with status="active" distinct_counts_active = await document_store.count_unique_metadata_by_filter_async( filters={"field": "meta.status", "operator": "==", "value": "active"}, - fields=["category", "status", "priority"], + metadata_fields=["category", "status", "priority"], ) assert distinct_counts_active["category"] == 3 # A, B, C assert distinct_counts_active["status"] == 1 # Only active @@ -315,7 +315,7 @@ async def test_count_unique_metadata_by_filter(self, document_store: OpenSearchD {"field": "meta.status", "operator": "==", "value": "active"}, ], }, - fields=["category", "status", "priority"], + metadata_fields=["category", "status", "priority"], ) assert distinct_counts_a_active["category"] == 1 # Only A assert distinct_counts_a_active["status"] == 1 # Only active @@ -323,7 +323,7 @@ async def test_count_unique_metadata_by_filter(self, document_store: OpenSearchD # Test with only a subset of fields distinct_counts_subset = await document_store.count_unique_metadata_by_filter_async( - filters={}, fields=["category", "status"] + filters={}, metadata_fields=["category", "status"] ) assert distinct_counts_subset["category"] == 3 assert distinct_counts_subset["status"] == 2 @@ -331,7 +331,7 @@ async def test_count_unique_metadata_by_filter(self, document_store: OpenSearchD # Test field name normalization (with "meta." prefix) distinct_counts_normalized = await document_store.count_unique_metadata_by_filter_async( - filters={}, fields=["meta.category", "status", "meta.priority"] + filters={}, metadata_fields=["meta.category", "status", "meta.priority"] ) assert distinct_counts_normalized["category"] == 3 assert distinct_counts_normalized["status"] == 2 @@ -339,7 +339,9 @@ async def test_count_unique_metadata_by_filter(self, document_store: OpenSearchD # Test error handling when field doesn't exist with pytest.raises(ValueError, match="Fields not found in index mapping"): - await document_store.count_unique_metadata_by_filter_async(filters={}, fields=["nonexistent_field"]) + await document_store.count_unique_metadata_by_filter_async( + filters={}, metadata_fields=["nonexistent_field"] + ) @pytest.mark.asyncio async def test_delete_documents(self, document_store: OpenSearchDocumentStore): From da07149743177904dbf6f5473316501733b1bc6c Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 15 Jan 2026 15:07:22 +0100 Subject: [PATCH 57/58] updating document_store.get_metadata_field_unique_values --- .../opensearch/document_store.py | 122 ++++++++++-------- .../opensearch/tests/test_document_store.py | 26 ++-- .../tests/test_document_store_async.py | 34 +++-- 3 files changed, 108 insertions(+), 74 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 71a19ffe3a..3bfe7a3675 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -1446,16 +1446,24 @@ async def get_metadata_field_min_max_async(self, metadata_field: str) -> dict[st return self._extract_min_max_from_stats(stats) def get_metadata_field_unique_values( - self, metadata_field: str, search_term: str | None, from_: int, size: int - ) -> list[str]: + self, + metadata_field: str, + search_term: str | None = None, + size: int | None = 10000, + after: dict[str, Any] | None = None, + ) -> tuple[list[str], dict[str, Any] | None]: """ Returns unique values for a metadata field, optionally filtered by a search term in the content. + Uses composite aggregations for proper pagination beyond 10k results. :param metadata_field: The metadata field to get unique values for. :param search_term: Optional search term to filter documents by matching in the content field. - :param from_: The starting index for pagination. - :param size: The number of unique values to return. - :returns: A tuple containing (list of unique values, total count of unique values). + :param size: The number of unique values to return per page. Defaults to 10000. + :param after: Optional pagination key from the previous response. Use None for the first page. + For subsequent pages, pass the `after_key` from the previous response. + :returns: A tuple containing (list of unique values, after_key for pagination). + The after_key is None when there are no more results. Use it in the `after` parameter + for the next page. """ self._ensure_initialized() assert self._client is not None @@ -1468,26 +1476,20 @@ def get_metadata_field_unique_values( # Use match_phrase for exact phrase matching to avoid tokenization issues query = {"match_phrase": {"content": search_term}} - # Build aggregations - # Terms aggregation for paginated unique values - # Note: Terms aggregation doesn't support 'from' parameter directly, - # so we fetch from_ + size results and slice them - # Cardinality aggregation for total count - terms_size = from_ + size if from_ > 0 else size + # Build composite aggregation for proper pagination + composite_agg: dict[str, Any] = { + "size": size, + "sources": [{field_name: {"terms": {"field": field_name}}}], + } + if after is not None: + composite_agg["after"] = after + body = { "query": query, "aggs": { "unique_values": { - "terms": { - "field": field_name, - "size": terms_size, - } - }, - "total_count": { - "cardinality": { - "field": field_name, - } - }, + "composite": composite_agg, + } }, "size": 0, # we only need aggregations, not documents } @@ -1495,25 +1497,38 @@ def get_metadata_field_unique_values( result = self._client.search(index=self._index, body=body) aggregations = result.get("aggregations", {}) - # Extract unique values from terms aggregation buckets - unique_values_buckets = aggregations.get("unique_values", {}).get("buckets", []) - # Apply pagination by slicing the results - paginated_buckets = unique_values_buckets[from_ : from_ + size] - unique_values = [str(bucket["key"]) for bucket in paginated_buckets] + # Extract unique values from composite aggregation buckets + unique_values_agg = aggregations.get("unique_values", {}) + unique_values_buckets = unique_values_agg.get("buckets", []) + unique_values = [str(bucket["key"][field_name]) for bucket in unique_values_buckets] + + # Extract after_key for pagination + # If we got fewer results than requested, we've reached the end + after_key = unique_values_agg.get("after_key") + if after_key is not None and size is not None and len(unique_values_buckets) < size: + after_key = None - return unique_values + return unique_values, after_key async def get_metadata_field_unique_values_async( - self, metadata_field: str, search_term: str | None, from_: int, size: int - ) -> list[str]: + self, + metadata_field: str, + search_term: str | None = None, + size: int | None = 10000, + after: dict[str, Any] | None = None, + ) -> tuple[list[str], dict[str, Any] | None]: """ Asynchronously returns unique values for a metadata field, optionally filtered by a search term in the content. + Uses composite aggregations for proper pagination beyond 10k results. :param metadata_field: The metadata field to get unique values for. :param search_term: Optional search term to filter documents by matching in the content field. - :param from_: The starting index for pagination. - :param size: The number of unique values to return. - :returns: A tuple containing (list of unique values, total count of unique values). + :param size: The number of unique values to return per page. Defaults to 10000. + :param after: Optional pagination key from the previous response. Use None for the first page. + For subsequent pages, pass the `after_key` from the previous response. + :returns: A tuple containing (list of unique values, after_key for pagination). + The after_key is None when there are no more results. Use it in the `after` parameter + for the next page. """ await self._ensure_initialized_async() assert self._async_client is not None @@ -1526,26 +1541,20 @@ async def get_metadata_field_unique_values_async( # Use match_phrase for exact phrase matching to avoid tokenization issues query = {"match_phrase": {"content": search_term}} - # Build aggregations - # Terms aggregation for paginated unique values - # Note: Terms aggregation doesn't support 'from' parameter directly, - # so we fetch from_ + size results and slice them - # Cardinality aggregation for total count - terms_size = from_ + size if from_ > 0 else size + # Build composite aggregation for proper pagination + composite_agg: dict[str, Any] = { + "size": size, + "sources": [{field_name: {"terms": {"field": field_name}}}], + } + if after is not None: + composite_agg["after"] = after + body = { "query": query, "aggs": { "unique_values": { - "terms": { - "field": field_name, - "size": terms_size, - } - }, - "total_count": { - "cardinality": { - "field": field_name, - } - }, + "composite": composite_agg, + } }, "size": 0, # we only need aggregations, not documents } @@ -1553,10 +1562,15 @@ async def get_metadata_field_unique_values_async( result = await self._async_client.search(index=self._index, body=body) aggregations = result.get("aggregations", {}) - # Extract unique values from terms aggregation buckets - unique_values_buckets = aggregations.get("unique_values", {}).get("buckets", []) - # Apply pagination by slicing the results - paginated_buckets = unique_values_buckets[from_ : from_ + size] - unique_values = [str(bucket["key"]) for bucket in paginated_buckets] + # Extract unique values from composite aggregation buckets + unique_values_agg = aggregations.get("unique_values", {}) + unique_values_buckets = unique_values_agg.get("buckets", []) + unique_values = [str(bucket["key"][field_name]) for bucket in unique_values_buckets] + + # Extract after_key for pagination + # If we got fewer results than requested, we've reached the end + after_key = unique_values_agg.get("after_key") + if after_key is not None and size is not None and len(unique_values_buckets) < size: + after_key = None - return unique_values + return unique_values, after_key diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index bad19c4dbe..c45288b3d7 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -758,29 +758,37 @@ def test_get_metadata_field_unique_values(self, document_store: OpenSearchDocume document_store.write_documents(docs) # Test getting all unique values without search term - unique_values = document_store.get_metadata_field_unique_values("meta.category", None, 0, 10) + unique_values, after_key = document_store.get_metadata_field_unique_values("meta.category", None, 10) assert set(unique_values) == {"A", "B", "C"} + # after_key should be None when all results are returned + assert after_key is None # Test with "meta." prefix - unique_languages = document_store.get_metadata_field_unique_values("meta.language", None, 0, 10) + unique_languages, _ = document_store.get_metadata_field_unique_values("meta.language", None, 10) assert set(unique_languages) == {"Python", "Java", "JavaScript"} # Test pagination - first page - unique_values_page1 = document_store.get_metadata_field_unique_values("meta.category", None, 0, 2) + unique_values_page1, after_key_page1 = document_store.get_metadata_field_unique_values("meta.category", None, 2) assert len(unique_values_page1) == 2 assert all(val in ["A", "B", "C"] for val in unique_values_page1) + # Should have an after_key for pagination + assert after_key_page1 is not None - # Test pagination - second page - unique_values_page2 = document_store.get_metadata_field_unique_values("meta.category", None, 2, 2) + # Test pagination - second page using after_key + unique_values_page2, after_key_page2 = document_store.get_metadata_field_unique_values( + "meta.category", None, 2, after=after_key_page1 + ) assert len(unique_values_page2) == 1 assert unique_values_page2[0] in ["A", "B", "C"] + # Should have no more results + assert after_key_page2 is None # Test with search term - filter by content matching "Python" - unique_values_filtered = document_store.get_metadata_field_unique_values("meta.category", "Python", 0, 10) + unique_values_filtered, _ = document_store.get_metadata_field_unique_values("meta.category", "Python", 10) assert set(unique_values_filtered) == {"A"} # Only category A has documents with "Python" in content # Test with search term - filter by content matching "Java" - unique_values_java = document_store.get_metadata_field_unique_values("meta.category", "Java", 0, 10) + unique_values_java, _ = document_store.get_metadata_field_unique_values("meta.category", "Java", 10) assert set(unique_values_java) == {"B"} # Only category B has documents with "Java" in content # Test with integer values @@ -791,11 +799,11 @@ def test_get_metadata_field_unique_values(self, document_store: OpenSearchDocume Document(content="Doc 4", meta={"priority": 3}), ] document_store.write_documents(int_docs) - unique_priorities = document_store.get_metadata_field_unique_values("meta.priority", None, 0, 10) + unique_priorities, _ = document_store.get_metadata_field_unique_values("meta.priority", None, 10) assert set(unique_priorities) == {"1", "2", "3"} # Test with search term on integer field - unique_priorities_filtered = document_store.get_metadata_field_unique_values("meta.priority", "Doc 1", 0, 10) + unique_priorities_filtered, _ = document_store.get_metadata_field_unique_values("meta.priority", "Doc 1", 10) assert set(unique_priorities_filtered) == {"1"} @pytest.mark.integration diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index b33ee2677e..18ceb3c4de 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -569,31 +569,43 @@ async def test_get_metadata_field_unique_values_async(self, document_store: Open await document_store.write_documents_async(docs) # Test getting all unique values without search term - unique_values = await document_store.get_metadata_field_unique_values_async("meta.category", None, 0, 10) + unique_values, after_key = await document_store.get_metadata_field_unique_values_async( + "meta.category", None, 10 + ) assert set(unique_values) == {"A", "B", "C"} + # after_key should be None when all results are returned + assert after_key is None # Test with "meta." prefix - unique_languages = await document_store.get_metadata_field_unique_values_async("meta.language", None, 0, 10) + unique_languages, _ = await document_store.get_metadata_field_unique_values_async("meta.language", None, 10) assert set(unique_languages) == {"Python", "Java", "JavaScript"} # Test pagination - first page - unique_values_page1 = await document_store.get_metadata_field_unique_values_async("meta.category", None, 0, 2) + unique_values_page1, after_key_page1 = await document_store.get_metadata_field_unique_values_async( + "meta.category", None, 2 + ) assert len(unique_values_page1) == 2 assert all(val in ["A", "B", "C"] for val in unique_values_page1) + # Should have an after_key for pagination + assert after_key_page1 is not None - # Test pagination - second page - unique_values_page2 = await document_store.get_metadata_field_unique_values_async("meta.category", None, 2, 2) + # Test pagination - second page using after_key + unique_values_page2, after_key_page2 = await document_store.get_metadata_field_unique_values_async( + "meta.category", None, 2, after=after_key_page1 + ) assert len(unique_values_page2) == 1 assert unique_values_page2[0] in ["A", "B", "C"] + # Should have no more results + assert after_key_page2 is None # Test with search term - filter by content matching "Python" - unique_values_filtered = await document_store.get_metadata_field_unique_values_async( - "meta.category", "Python", 0, 10 + unique_values_filtered, _ = await document_store.get_metadata_field_unique_values_async( + "meta.category", "Python", 10 ) assert set(unique_values_filtered) == {"A"} # Only category A has documents with "Python" in content # Test with search term - filter by content matching "Java" - unique_values_java = await document_store.get_metadata_field_unique_values_async("meta.category", "Java", 0, 10) + unique_values_java, _ = await document_store.get_metadata_field_unique_values_async("meta.category", "Java", 10) assert set(unique_values_java) == {"B"} # Only category B has documents with "Java" in content # Test with integer values @@ -604,11 +616,11 @@ async def test_get_metadata_field_unique_values_async(self, document_store: Open Document(content="Doc 4", meta={"priority": 3}), ] await document_store.write_documents_async(int_docs) - unique_priorities = await document_store.get_metadata_field_unique_values_async("meta.priority", None, 0, 10) + unique_priorities, _ = await document_store.get_metadata_field_unique_values_async("meta.priority", None, 10) assert set(unique_priorities) == {"1", "2", "3"} # Test with search term on integer field - unique_priorities_filtered = await document_store.get_metadata_field_unique_values_async( - "meta.priority", "Doc 1", 0, 10 + unique_priorities_filtered, _ = await document_store.get_metadata_field_unique_values_async( + "meta.priority", "Doc 1", 10 ) assert set(unique_priorities_filtered) == {"1"} From 27e1eaee166aba285f0b527f386cb38f4466fbad Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 16 Jan 2026 10:57:13 +0100 Subject: [PATCH 58/58] removing ResponseFormat --- .../document_stores/opensearch/document_store.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 3bfe7a3675..41b9c9abf3 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -23,8 +23,6 @@ SPECIAL_FIELDS = {"content", "embedding", "id", "score", "sparse_embedding", "blob"} -ResponseFormat = Literal["json", "jdbc", "csv", "raw"] - Hosts = str | list[str | Mapping[str, str | int]] # document scores are essentially unbounded and will be scaled to values between 0 and 1 if scale_score is set to