diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py index 65d91d2a9b..c8bfa38eaa 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py @@ -15,7 +15,7 @@ from haystack.document_stores.types import DuplicatePolicy from haystack.utils.auth import Secret from haystack.utils.misc import _normalize_metadata_field_name -from opensearchpy import AsyncHttpConnection, AsyncOpenSearch, OpenSearch, TransportError +from opensearchpy import AsyncHttpConnection, AsyncOpenSearch, OpenSearch, RequestError, TransportError from opensearchpy.helpers import async_bulk, bulk from haystack_integrations.document_stores.opensearch.auth import AsyncAWSAuth, AWSAuth @@ -91,6 +91,7 @@ def __init__( use_ssl: bool | None = None, verify_certs: bool | None = None, timeout: int | None = None, + nested_fields: list[str] | Literal["*"] | None = None, **kwargs: Any, ) -> None: """ @@ -128,6 +129,12 @@ def __init__( :param use_ssl: Whether to use SSL. Defaults to None :param verify_certs: Whether to verify certificates. Defaults to None :param timeout: Timeout in seconds. Defaults to None + :param nested_fields: List of metadata field paths (without the `meta.` prefix) that should be mapped + as OpenSearch `nested` type, enabling multi-condition filtering on array-of-objects fields. + Pass `"*"` to auto-detect `list[dict]` fields and map them as nested from + the first `write_documents` batch. + When the index already exists, nested fields are discovered from the live mapping. + Defaults to None (no nested support). :param **kwargs: Optional arguments that ``OpenSearch`` takes. For the full list of supported kwargs, see the [official OpenSearch reference](https://opensearch-project.github.io/opensearch-py/api-ref/clients/opensearch_client.html) """ @@ -137,6 +144,7 @@ def __init__( self._embedding_dim = embedding_dim self._return_embedding = return_embedding self._method = method + self._nested_fields = nested_fields self._mappings = mappings or self._get_default_mappings() self._settings = settings self._create_index = create_index @@ -146,6 +154,10 @@ def __init__( self._timeout = timeout self._kwargs = kwargs + self._resolved_nested_fields: set[str] = set() + if nested_fields and nested_fields != "*": + self._resolved_nested_fields = set(nested_fields) + # Client is initialized lazily to prevent side effects when # the document store is instantiated. self._client: OpenSearch | None = None @@ -169,6 +181,9 @@ def _get_default_mappings(self) -> dict[str, Any]: } if self._method: default_mappings["properties"]["embedding"]["method"] = self._method + if self._nested_fields and self._nested_fields != "*": + for field in self._nested_fields: + default_mappings["properties"][field] = {"type": "nested"} return default_mappings def create_index( @@ -235,6 +250,7 @@ def to_dict(self) -> dict[str, Any]: use_ssl=self._use_ssl, verify_certs=self._verify_certs, timeout=self._timeout, + nested_fields=self._nested_fields, **self._kwargs, ) @@ -309,6 +325,71 @@ async def _ensure_initialized_async(self) -> None: self._initialized = True await self._ensure_index_exists_async() + @staticmethod + def _extract_nested_fields_from_mapping(mapping_properties: dict[str, Any]) -> set[str]: + return {name for name, defn in mapping_properties.items() if defn.get("type") == "nested"} + + def _populate_nested_fields_from_mapping(self, mapping_properties: dict[str, Any]) -> None: + live_nested = self._extract_nested_fields_from_mapping(mapping_properties) + self._resolved_nested_fields.update(live_nested) + + if self._nested_fields and self._nested_fields != "*": + declared_but_not_nested = set(self._nested_fields) - live_nested + for field in declared_but_not_nested: + logger.warning( + "Field '{field}' was declared as nested but is not mapped as nested in the existing " + "index '{index}'. A full re-index is required to change a field from object to nested.", + field=field, + index=self._index, + ) + + @staticmethod + def _detect_nested_fields_from_documents(documents: list[Document]) -> set[str]: + nested: set[str] = set() + for doc in documents: + if not doc.meta: + continue + for key, value in doc.meta.items(): + if isinstance(value, list) and value and all(isinstance(elem, dict) for elem in value): + nested.add(key) + return nested + + def _update_mapping_for_nested_fields(self, new_nested_fields: set[str]) -> None: + assert self._client is not None + for field in new_nested_fields: + try: + self._client.indices.put_mapping( + index=self._index, + body={"properties": {field: {"type": "nested"}}}, + ) + self._resolved_nested_fields.add(field) + except RequestError as e: + logger.warning( + "Could not map field '{field}' as nested in index '{index}': {error}. " + "If it was previously mapped as object, a full re-index is required.", + field=field, + index=self._index, + error=str(e), + ) + + async def _update_mapping_for_nested_fields_async(self, new_nested_fields: set[str]) -> None: + assert self._async_client is not None + for field in new_nested_fields: + try: + await self._async_client.indices.put_mapping( + index=self._index, + body={"properties": {field: {"type": "nested"}}}, + ) + self._resolved_nested_fields.add(field) + except RequestError as e: + logger.warning( + "Could not map field '{field}' as nested in index '{index}': {error}. " + "If it was previously mapped as object, a full re-index is required.", + field=field, + index=self._index, + error=str(e), + ) + async def _ensure_index_exists_async(self) -> None: assert self._async_client is not None @@ -318,6 +399,9 @@ async def _ensure_index_exists_async(self) -> None: "`settings` values will be ignored.", index=self._index, ) + mapping = await self._async_client.indices.get_mapping(index=self._index) + properties = mapping[self._index]["mappings"].get("properties", {}) + self._populate_nested_fields_from_mapping(properties) elif self._create_index: # Create the index if it doesn't exist body = {"mappings": self._mappings, "settings": self._settings} @@ -332,6 +416,9 @@ def _ensure_index_exists(self) -> None: "`settings` values will be ignored.", index=self._index, ) + mapping = self._client.indices.get_mapping(index=self._index) + properties = mapping[self._index]["mappings"].get("properties", {}) + self._populate_nested_fields_from_mapping(properties) elif self._create_index: # Create the index if it doesn't exist body = {"mappings": self._mappings, "settings": self._settings} @@ -370,7 +457,9 @@ def _deserialize_search_hits(hits: list[dict[str, Any]]) -> list[Document]: def _prepare_filter_search_request(self, filters: dict[str, Any] | None) -> dict[str, Any]: search_kwargs: dict[str, Any] = {"size": 10_000} if filters: - search_kwargs["query"] = {"bool": {"filter": normalize_filters(filters)}} + search_kwargs["query"] = { + "bool": {"filter": normalize_filters(filters, nested_fields=self._resolved_nested_fields)} + } # For some applications not returning the embedding can save a lot of bandwidth # if you don't need this data not retrieving it can be a good idea @@ -521,6 +610,12 @@ def write_documents( """ self._ensure_initialized() + if self._nested_fields == "*" and documents: + detected_nested_fields = self._detect_nested_fields_from_documents(documents) + new_nested_fields = detected_nested_fields - self._resolved_nested_fields + if new_nested_fields: + self._update_mapping_for_nested_fields(new_nested_fields) + bulk_params = self._prepare_bulk_write_request( documents=documents, policy=policy, is_async=False, refresh=refresh ) @@ -548,6 +643,13 @@ async def write_documents_async( """ await self._ensure_initialized_async() assert self._async_client is not None + + if self._nested_fields == "*" and documents: + detected_nested_fields = self._detect_nested_fields_from_documents(documents) + new_nested_fields = detected_nested_fields - self._resolved_nested_fields + if new_nested_fields: + await self._update_mapping_for_nested_fields_async(new_nested_fields) + bulk_params = self._prepare_bulk_write_request( documents=documents, policy=policy, is_async=True, refresh=refresh ) @@ -757,7 +859,7 @@ def delete_by_filter(self, filters: dict[str, Any], refresh: bool = False) -> in assert self._client is not None try: - normalized_filters = normalize_filters(filters) + normalized_filters = normalize_filters(filters, nested_fields=self._resolved_nested_fields) body = {"query": {"bool": {"filter": normalized_filters}}} result = self._client.delete_by_query(index=self._index, body=body, refresh=refresh) @@ -787,7 +889,7 @@ async def delete_by_filter_async(self, filters: dict[str, Any], refresh: bool = assert self._async_client is not None try: - normalized_filters = normalize_filters(filters) + normalized_filters = normalize_filters(filters, nested_fields=self._resolved_nested_fields) body = {"query": {"bool": {"filter": normalized_filters}}} result = await self._async_client.delete_by_query(index=self._index, body=body, refresh=refresh) deleted_count = result.get("deleted", 0) @@ -817,7 +919,7 @@ def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any], refres assert self._client is not None try: - normalized_filters = normalize_filters(filters) + normalized_filters = normalize_filters(filters, nested_fields=self._resolved_nested_fields) # Build the update script to modify metadata fields # Documents are stored with flattened metadata, so update fields directly in ctx._source update_script_lines = [] @@ -857,7 +959,7 @@ async def update_by_filter_async(self, filters: dict[str, Any], meta: dict[str, assert self._async_client is not None try: - normalized_filters = normalize_filters(filters) + normalized_filters = normalize_filters(filters, nested_fields=self._resolved_nested_fields) # Build the update script to modify metadata fields # Documents are stored with flattened metadata, so update fields directly in ctx._source update_script_lines = [] @@ -894,14 +996,16 @@ def _prepare_bm25_search_request( if not query: body: dict[str, Any] = {"query": {"bool": {"must": {"match_all": {}}}}} if filters: - body["query"]["bool"]["filter"] = normalize_filters(filters) + body["query"]["bool"]["filter"] = normalize_filters(filters, nested_fields=self._resolved_nested_fields) if isinstance(custom_query, dict): body = self._render_custom_query( custom_query, { "$query": query, - "$filters": normalize_filters(filters) if filters else None, + "$filters": normalize_filters(filters, nested_fields=self._resolved_nested_fields) + if filters + else None, }, ) @@ -925,7 +1029,7 @@ def _prepare_bm25_search_request( } if filters: - body["query"]["bool"]["filter"] = normalize_filters(filters) + body["query"]["bool"]["filter"] = normalize_filters(filters, nested_fields=self._resolved_nested_fields) body["size"] = top_k @@ -1330,7 +1434,7 @@ def _metadata_search( # Add filters if provided if filters: - normalized_filters = normalize_filters(filters) + normalized_filters = normalize_filters(filters, nested_fields=self._resolved_nested_fields) self._apply_metadata_search_filters(os_query, [normalized_filters], mode) body = {"size": 1000, "query": os_query} @@ -1435,7 +1539,7 @@ async def _metadata_search_async( # Add filters if provided if filters: - normalized_filters = normalize_filters(filters) + normalized_filters = normalize_filters(filters, nested_fields=self._resolved_nested_fields) self._apply_metadata_search_filters(os_query, [normalized_filters], mode) body = {"size": 1000, "query": os_query} @@ -1476,7 +1580,9 @@ def _prepare_embedding_search_request( custom_query, { "$query_embedding": query_embedding, - "$filters": normalize_filters(filters) if filters else None, + "$filters": normalize_filters(filters, nested_fields=self._resolved_nested_fields) + if filters + else None, }, ) @@ -1501,9 +1607,13 @@ def _prepare_embedding_search_request( if filters: if efficient_filtering: - body["query"]["bool"]["must"][0]["knn"]["embedding"]["filter"] = normalize_filters(filters) + body["query"]["bool"]["must"][0]["knn"]["embedding"]["filter"] = normalize_filters( + filters, nested_fields=self._resolved_nested_fields + ) else: - body["query"]["bool"]["filter"] = normalize_filters(filters) + body["query"]["bool"]["filter"] = normalize_filters( + filters, nested_fields=self._resolved_nested_fields + ) body["size"] = top_k @@ -1609,7 +1719,7 @@ def count_documents_by_filter(self, filters: dict[str, Any]) -> int: self._ensure_initialized() assert self._client is not None - normalized_filters = normalize_filters(filters) + normalized_filters = normalize_filters(filters, nested_fields=self._resolved_nested_fields) body = {"query": {"bool": {"filter": normalized_filters}}} return self._client.count(index=self._index, body=body)["count"] @@ -1624,7 +1734,7 @@ async def count_documents_by_filter_async(self, filters: dict[str, Any]) -> int: await self._ensure_initialized_async() assert self._async_client is not None - normalized_filters = normalize_filters(filters) + normalized_filters = normalize_filters(filters, nested_fields=self._resolved_nested_fields) body = {"query": {"bool": {"filter": normalized_filters}}} return (await self._async_client.count(index=self._index, body=body))["count"] @@ -1645,13 +1755,12 @@ def _build_cardinality_aggregations(index_mapping: dict[str, Any], fields: list[ aggs[f"{field_name}_cardinality"] = {"cardinality": {"field": field_name}} return aggs - @staticmethod - def _build_distinct_values_query_body(filters: dict[str, Any] | None, aggs: dict[str, Any]) -> dict[str, Any]: + def _build_distinct_values_query_body(self, filters: dict[str, Any] | None, aggs: dict[str, Any]) -> dict[str, Any]: """ Builds the query body for distinct values counting with filters and aggregations. """ if filters: - normalized_filters = normalize_filters(filters) + normalized_filters = normalize_filters(filters, nested_fields=self._resolved_nested_fields) return { "query": {"bool": {"filter": normalized_filters}}, "aggs": aggs, diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/filters.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/filters.py index 30f0790420..f4375a7974 100644 --- a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/filters.py +++ b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/filters.py @@ -7,20 +7,105 @@ from haystack.errors import FilterError -def normalize_filters(filters: dict[str, Any]) -> dict[str, Any]: +def normalize_filters(filters: dict[str, Any], nested_fields: set[str] | None = None) -> dict[str, Any]: """ Converts Haystack filters in OpenSearch compatible filters. + + :param filters: Haystack filter dictionary. + :param nested_fields: Set of metadata field paths that are mapped as `nested` type in OpenSearch. + When provided, conditions targeting sub-fields of these paths are wrapped in `nested` queries. """ if not isinstance(filters, dict): msg = "Filters must be a dictionary" raise FilterError(msg) if "field" in filters: - return {"bool": {"must": _parse_comparison_condition(filters)}} - return _parse_logical_condition(filters) + parsed = _parse_comparison_condition(filters, nested_fields) + if nested_fields: + nested_path = _get_nested_path(filters, nested_fields) + if nested_path: + return {"bool": {"must": {"nested": {"path": nested_path, "query": parsed}}}} + return {"bool": {"must": parsed}} + return _parse_logical_condition(filters, nested_fields) + + +def _get_nested_path(condition: dict[str, Any], nested_fields: set[str]) -> str | None: + """Returns the nested path for a comparison condition, or None.""" + if not (field := condition.get("field")): + return None + if field.startswith("meta."): + field = field[5:] + parts = field.split(".") + for i in range(1, len(parts)): + prefix = ".".join(parts[:i]) + if prefix in nested_fields: + return prefix + return None + + +def _get_logical_condition_nested_path(condition: dict[str, Any], nested_fields: set[str]) -> str | None: + """Returns the common nested path if all leaf comparisons in a logical sub-group share one, else None.""" + paths: set[str | None] = set() + for c in condition.get("conditions", []): + if "field" in c: + paths.add(_get_nested_path(c, nested_fields)) + elif "operator" in c and "conditions" in c: + paths.add(_get_logical_condition_nested_path(c, nested_fields)) + else: + return None + if len(paths) == 1: + return next(iter(paths)) + return None + + +def _group_nested_conditions( + raw_conditions: list[dict[str, Any]], + nested_fields: set[str], + operator: str, +) -> list[dict[str, Any]]: + """ + Group conditions by nested path and wrap each group in a `nested` query. + Both direct comparisons and logical sub-groups whose leaves all target the + same nested path are absorbed into the group so that they match within the + same array element. + """ + nested_groups: dict[str, list[dict[str, Any]]] = {} + flat_raw: list[dict[str, Any]] = [] + + for c in raw_conditions: + nested_path = _get_nested_path(c, nested_fields) + if nested_path is None and "operator" in c and "conditions" in c: + nested_path = _get_logical_condition_nested_path(c, nested_fields) + if nested_path: + nested_groups.setdefault(nested_path, []).append(c) + else: + flat_raw.append(c) + + conditions = [_parse_comparison_condition(c, nested_fields) for c in flat_raw] + + for path, group in nested_groups.items(): + inner = [] + for c in group: + if "operator" in c and "conditions" in c: + # Logical sub-group: parse without nested awareness to avoid + # redundant nested wrapping — the outer code handles that. + inner.append(_parse_logical_condition(c, nested_fields=None)) + else: + inner.append(_parse_comparison_condition(c, nested_fields)) + if len(inner) > 1: + inner = _normalize_ranges(inner) + if len(inner) == 1: + conditions.append({"nested": {"path": path, "query": inner[0]}}) + elif operator == "OR": + conditions.append({"nested": {"path": path, "query": {"bool": {"should": inner}}}}) + else: + conditions.append({"nested": {"path": path, "query": {"bool": {"must": inner}}}}) -def _parse_logical_condition(condition: dict[str, Any]) -> dict[str, Any]: + return conditions + + +def _parse_logical_condition(condition: dict[str, Any], nested_fields: set[str] | None = None) -> dict[str, Any]: if "operator" not in condition: msg = f"'operator' key missing in {condition}" raise FilterError(msg) @@ -29,7 +114,12 @@ def _parse_logical_condition(condition: dict[str, Any]) -> dict[str, Any]: raise FilterError(msg) operator = condition["operator"] - conditions = [_parse_comparison_condition(c) for c in condition["conditions"]] + + if nested_fields: + conditions = _group_nested_conditions(condition["conditions"], nested_fields, operator) + else: + conditions = [_parse_comparison_condition(c, nested_fields) for c in condition["conditions"]] + if len(conditions) > 1: conditions = _normalize_ranges(conditions) if operator == "AND": @@ -189,11 +279,11 @@ def _not_in(field: str, value: Any) -> dict[str, Any]: } -def _parse_comparison_condition(condition: dict[str, Any]) -> dict[str, Any]: +def _parse_comparison_condition(condition: dict[str, Any], nested_fields: set[str] | None = None) -> dict[str, Any]: if "field" not in condition: # 'field' key is only found in comparison dictionaries. # We assume this is a logic dictionary since it's not present. - return _parse_logical_condition(condition) + return _parse_logical_condition(condition, nested_fields) field: str = condition["field"] if field.startswith("meta."): diff --git a/integrations/opensearch/tests/conftest.py b/integrations/opensearch/tests/conftest.py index 42f52de7eb..30a5f14325 100644 --- a/integrations/opensearch/tests/conftest.py +++ b/integrations/opensearch/tests/conftest.py @@ -143,6 +143,54 @@ def document_store_embedding_dim_4_no_emb_returned_faiss(): store._client.indices.delete(index=index, params={"ignore": [400, 404]}) +@pytest.fixture +def document_store_nested(): + """ + OpenSearch document store with explicit nested fields. + """ + hosts = ["https://localhost:9200"] + index = _get_unique_index_name() + + store = OpenSearchDocumentStore( + hosts=hosts, + index=index, + http_auth=("admin", "SecureHaystack!2026"), + verify_certs=False, + embedding_dim=768, + return_embedding=False, + nested_fields=["refs", "tags"], + ) + store._ensure_initialized() + yield store + + assert store._client + store._client.indices.delete(index=index, params={"ignore": [400, 404]}) + + +@pytest.fixture +def document_store_wildcard_nested(): + """ + OpenSearch document store with wildcard nested field auto-detection. + """ + hosts = ["https://localhost:9200"] + index = _get_unique_index_name() + + store = OpenSearchDocumentStore( + hosts=hosts, + index=index, + http_auth=("admin", "SecureHaystack!2026"), + verify_certs=False, + embedding_dim=768, + return_embedding=False, + nested_fields="*", + ) + store._ensure_initialized() + yield store + + assert store._client + store._client.indices.delete(index=index, params={"ignore": [400, 404]}) + + @pytest.fixture def test_documents(): return [ diff --git a/integrations/opensearch/tests/test_auth.py b/integrations/opensearch/tests/test_auth.py index 26e231620a..393e374e91 100644 --- a/integrations/opensearch/tests/test_auth.py +++ b/integrations/opensearch/tests/test_auth.py @@ -308,6 +308,7 @@ def test_ds_to_dict_basic_auth(self, _mock_opensearch_client): "use_ssl": None, "verify_certs": None, "timeout": None, + "nested_fields": None, }, } @@ -354,6 +355,7 @@ def test_ds_to_dict_aws_auth(self, _mock_opensearch_client, monkeypatch: pytest. "use_ssl": None, "verify_certs": None, "timeout": None, + "nested_fields": None, }, } diff --git a/integrations/opensearch/tests/test_bm25_retriever.py b/integrations/opensearch/tests/test_bm25_retriever.py index 7d1d302430..5181db75e6 100644 --- a/integrations/opensearch/tests/test_bm25_retriever.py +++ b/integrations/opensearch/tests/test_bm25_retriever.py @@ -63,6 +63,7 @@ def test_to_dict(_mock_opensearch_client): "use_ssl": None, "verify_certs": None, "timeout": None, + "nested_fields": None, }, "type": "haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore", }, diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 7c6200d4ba..912a523c87 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -52,6 +52,7 @@ def test_to_dict(_mock_opensearch_client): "use_ssl": None, "verify_certs": None, "timeout": None, + "nested_fields": None, }, } @@ -373,6 +374,184 @@ async def test_bm25_retrieval_async_reraises_other_transport_errors(_mock_opense assert store._async_client.search.call_count == 1 +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_to_dict_with_nested_fields(_mock_opensearch_client): + document_store = OpenSearchDocumentStore(hosts="some hosts", nested_fields=["attributes", "tags"]) + res = document_store.to_dict() + assert res["init_parameters"]["nested_fields"] == ["attributes", "tags"] + props = res["init_parameters"]["mappings"]["properties"] + assert props["attributes"] == {"type": "nested"} + assert props["tags"] == {"type": "nested"} + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_from_dict_with_nested_fields(_mock_opensearch_client): + data = { + "type": "haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore", + "init_parameters": { + "hosts": "some hosts", + "index": "default", + "nested_fields": ["attributes"], + "http_auth": ("admin", "admin"), + }, + } + document_store = OpenSearchDocumentStore.from_dict(data) + assert document_store._nested_fields == ["attributes"] + assert document_store._resolved_nested_fields == {"attributes"} + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_detect_nested_fields_from_documents(_mock_opensearch_client): + docs = [ + Document( + content="doc1", + meta={ + "attributes": [{"color": "red"}, {"color": "blue"}], + "tags": [{"name": "sale"}], + "status": "active", + "scores": [1, 2, 3], + }, + ), + Document(content="doc2", meta={"other_nested": [{"key": "val"}]}), + Document(content="doc3"), + ] + result = OpenSearchDocumentStore._detect_nested_fields_from_documents(docs) + assert result == {"attributes", "tags", "other_nested"} + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_detect_nested_fields_empty_list(_mock_opensearch_client): + docs = [Document(content="doc1", meta={"items": []})] + result = OpenSearchDocumentStore._detect_nested_fields_from_documents(docs) + assert result == set() + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_extract_nested_fields_from_mapping(_mock_opensearch_client): + mapping_properties = { + "attributes": {"type": "nested"}, + "tags": {"type": "nested"}, + "content": {"type": "text"}, + "embedding": {"type": "knn_vector", "dimension": 768}, + "status": {"type": "keyword"}, + } + result = OpenSearchDocumentStore._extract_nested_fields_from_mapping(mapping_properties) + assert result == {"attributes", "tags"} + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_extract_nested_fields_from_mapping_none_nested(_mock_opensearch_client): + mapping_properties = { + "content": {"type": "text"}, + "status": {"type": "keyword"}, + } + result = OpenSearchDocumentStore._extract_nested_fields_from_mapping(mapping_properties) + assert result == set() + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_get_default_mappings_with_nested_fields(_mock_opensearch_client): + store = OpenSearchDocumentStore(hosts="testhost", nested_fields=["attributes", "tags"]) + props = store._mappings["properties"] + assert props["attributes"] == {"type": "nested"} + assert props["tags"] == {"type": "nested"} + assert props["content"] == {"type": "text"} + assert "embedding" in props + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_get_default_mappings_with_wildcard(_mock_opensearch_client): + store = OpenSearchDocumentStore(hosts="testhost", nested_fields="*") + props = store._mappings["properties"] + for val in props.values(): + assert val.get("type") != "nested" + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_populate_nested_fields_from_mapping(_mock_opensearch_client): + store = OpenSearchDocumentStore(hosts="testhost", nested_fields=["attributes"]) + assert store._resolved_nested_fields == {"attributes"} + + mapping_properties = { + "attributes": {"type": "nested"}, + "tags": {"type": "nested"}, + "content": {"type": "text"}, + } + store._populate_nested_fields_from_mapping(mapping_properties) + assert store._resolved_nested_fields == {"attributes", "tags"} + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +def test_resolved_nested_fields_with_wildcard(_mock_opensearch_client): + store = OpenSearchDocumentStore(hosts="testhost", nested_fields="*") + assert store._resolved_nested_fields == set() + assert store._nested_fields == "*" + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +@patch("haystack_integrations.document_stores.opensearch.document_store.bulk") +def test_wildcard_nested_fields_detected_on_write(mock_bulk, _mock_opensearch_client): + mock_bulk.return_value = (2, []) + store = OpenSearchDocumentStore(hosts="testhost", nested_fields="*") + store._client = MagicMock() + store._initialized = True + + docs = [ + Document( + content="doc1", + meta={ + "refs": [{"law": "bgb", "section": "1"}], + "tags": [{"name": "important"}], + "status": "active", + }, + ), + Document(content="doc2", meta={"refs": [{"law": "stgb"}]}), + ] + + store.write_documents(docs) + + assert "refs" in store._resolved_nested_fields + assert "tags" in store._resolved_nested_fields + assert "status" not in store._resolved_nested_fields + # put_mapping should have been called for each detected nested field + assert store._client.indices.put_mapping.call_count == 2 + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +@patch("haystack_integrations.document_stores.opensearch.document_store.bulk") +def test_wildcard_nested_fields_incremental_detection(mock_bulk, _mock_opensearch_client): + mock_bulk.return_value = (1, []) + store = OpenSearchDocumentStore(hosts="testhost", nested_fields="*") + store._client = MagicMock() + store._initialized = True + + # First batch: detects "refs" + store.write_documents([Document(content="d1", meta={"refs": [{"law": "bgb"}]})]) + assert store._resolved_nested_fields == {"refs"} + assert store._client.indices.put_mapping.call_count == 1 + + # Second batch: detects "tags" (refs already known) + store.write_documents([Document(content="d2", meta={"refs": [{"law": "stgb"}], "tags": [{"name": "x"}]})]) + assert store._resolved_nested_fields == {"refs", "tags"} + assert store._client.indices.put_mapping.call_count == 2 # only one additional call for "tags" + + +@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch") +@patch("haystack_integrations.document_stores.opensearch.document_store.bulk") +def test_explicit_nested_fields_no_detection_on_write(mock_bulk, _mock_opensearch_client): + mock_bulk.return_value = (1, []) + store = OpenSearchDocumentStore(hosts="testhost", nested_fields=["refs"]) + store._client = MagicMock() + store._initialized = True + + store.write_documents([Document(content="d1", meta={"tags": [{"name": "x"}]})]) + + # "tags" should NOT be added because we used explicit nested_fields, not wildcard + assert "tags" not in store._resolved_nested_fields + assert store._resolved_nested_fields == {"refs"} + store._client.indices.put_mapping.assert_not_called() + + @pytest.mark.integration class TestDocumentStore( CountDocumentsByFilterTest, @@ -1103,3 +1282,227 @@ def test_query_sql(self, document_store: OpenSearchDocumentStore): invalid_query = "SELECT * FROM non_existent_index" with pytest.raises(DocumentStoreError, match="Failed to execute SQL query"): document_store._query_sql(invalid_query) + + def test_explicit_nested_fields_filter(self, document_store_nested: OpenSearchDocumentStore): + """Filtering on explicitly declared nested fields returns correct documents.""" + docs = [ + Document( + content="doc about bgb 1a", + meta={"refs": [{"law": "bgb", "section": "1", "paragraph": "a"}], "status": "active"}, + ), + Document( + content="doc about bgb 2", + meta={"refs": [{"law": "bgb", "section": "2"}], "status": "active"}, + ), + Document( + content="doc about stgb", + meta={"refs": [{"law": "stgb", "section": "1"}], "status": "active"}, + ), + ] + document_store_nested.write_documents(docs) + + # Filter for refs.law == bgb + results = document_store_nested.filter_documents( + filters={"field": "meta.refs.law", "operator": "==", "value": "bgb"} + ) + assert len(results) == 2 + assert all("bgb" in str(doc.meta["refs"]) for doc in results) + + def test_explicit_nested_fields_combined_filter(self, document_store_nested: OpenSearchDocumentStore): + """AND filter across sub-fields of the same nested path matches within the same array element.""" + docs = [ + Document( + content="bgb section 1", + meta={"refs": [{"law": "bgb", "section": "1"}, {"law": "stgb", "section": "2"}]}, + ), + Document( + content="bgb section 2", + meta={"refs": [{"law": "bgb", "section": "2"}]}, + ), + Document( + content="stgb section 1", + meta={"refs": [{"law": "stgb", "section": "1"}]}, + ), + ] + document_store_nested.write_documents(docs) + + # Filter: refs.law == bgb AND refs.section == 1 (must match within same nested object) + results = document_store_nested.filter_documents( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.refs.section", "operator": "==", "value": "1"}, + ], + } + ) + assert len(results) == 1 + assert results[0].content == "bgb section 1" + + def test_explicit_nested_fields_mixed_nested_and_flat(self, document_store_nested: OpenSearchDocumentStore): + """Filtering on both nested and flat fields works correctly.""" + docs = [ + Document(content="d1", meta={"refs": [{"law": "bgb"}], "status": "active"}), + Document(content="d2", meta={"refs": [{"law": "bgb"}], "status": "inactive"}), + Document(content="d3", meta={"refs": [{"law": "stgb"}], "status": "active"}), + ] + document_store_nested.write_documents(docs) + + results = document_store_nested.filter_documents( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.status", "operator": "==", "value": "active"}, + ], + } + ) + assert len(results) == 1 + assert results[0].content == "d1" + + def test_wildcard_nested_fields_auto_detection(self, document_store_wildcard_nested: OpenSearchDocumentStore): + """With nested_fields=['*'], writing docs with list[dict] metadata auto-detects and maps nested fields.""" + assert document_store_wildcard_nested._resolved_nested_fields == set() + + docs = [ + Document(content="d1", meta={"refs": [{"law": "bgb", "section": "1"}], "status": "active"}), + Document(content="d2", meta={"refs": [{"law": "stgb"}], "tags": [{"name": "important"}]}), + ] + document_store_wildcard_nested.write_documents(docs) + + # After writing, nested fields should be detected + assert "refs" in document_store_wildcard_nested._resolved_nested_fields + assert "tags" in document_store_wildcard_nested._resolved_nested_fields + assert "status" not in document_store_wildcard_nested._resolved_nested_fields + + def test_wildcard_nested_fields_filter(self, document_store_wildcard_nested: OpenSearchDocumentStore): + """With wildcard auto-detection, nested filtering works correctly.""" + docs = [ + Document(content="bgb section 1", meta={"refs": [{"law": "bgb", "section": "1"}]}), + Document(content="bgb section 2", meta={"refs": [{"law": "bgb", "section": "2"}]}), + Document(content="stgb section 1", meta={"refs": [{"law": "stgb", "section": "1"}]}), + ] + document_store_wildcard_nested.write_documents(docs) + + # Nested AND filter on refs.law + refs.section + results = document_store_wildcard_nested.filter_documents( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.refs.section", "operator": "==", "value": "1"}, + ], + } + ) + assert len(results) == 1 + assert results[0].content == "bgb section 1" + + def test_wildcard_nested_fields_incremental_detection( + self, document_store_wildcard_nested: OpenSearchDocumentStore + ): + """A second write batch discovers new nested fields not seen in the first batch.""" + # First batch: only refs + document_store_wildcard_nested.write_documents([Document(content="d1", meta={"refs": [{"law": "bgb"}]})]) + assert "refs" in document_store_wildcard_nested._resolved_nested_fields + assert "tags" not in document_store_wildcard_nested._resolved_nested_fields + + # Second batch: introduces tags + document_store_wildcard_nested.write_documents([Document(content="d2", meta={"tags": [{"name": "sale"}]})]) + assert "tags" in document_store_wildcard_nested._resolved_nested_fields + + # Both nested fields are now filterable + results = document_store_wildcard_nested.filter_documents( + filters={"field": "meta.refs.law", "operator": "==", "value": "bgb"} + ) + assert len(results) == 1 + assert results[0].content == "d1" + + results = document_store_wildcard_nested.filter_documents( + filters={"field": "meta.tags.name", "operator": "==", "value": "sale"} + ) + assert len(results) == 1 + assert results[0].content == "d2" + + def test_nested_fields_or_filter(self, document_store_nested: OpenSearchDocumentStore): + """OR filter on nested sub-fields works correctly.""" + docs = [ + Document(content="bgb doc", meta={"refs": [{"law": "bgb"}]}), + Document(content="stgb doc", meta={"refs": [{"law": "stgb"}]}), + Document(content="other doc", meta={"refs": [{"law": "zpo"}]}), + ] + document_store_nested.write_documents(docs) + + results = document_store_nested.filter_documents( + filters={ + "operator": "OR", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.refs.law", "operator": "==", "value": "stgb"}, + ], + } + ) + assert len(results) == 2 + contents = sorted([r.content for r in results]) + assert contents == ["bgb doc", "stgb doc"] + + def test_nested_fields_not_filter(self, document_store_nested: OpenSearchDocumentStore): + """NOT filter on nested sub-fields excludes matching documents.""" + docs = [ + Document( + content="bgb section 1", + meta={"refs": [{"law": "bgb", "section": "1"}]}, + ), + Document( + content="bgb section 2", + meta={"refs": [{"law": "bgb", "section": "2"}]}, + ), + Document( + content="stgb section 1", + meta={"refs": [{"law": "stgb", "section": "1"}]}, + ), + ] + document_store_nested.write_documents(docs) + + # NOT (refs.law == bgb AND refs.section == 1) — only the first doc has both in the same nested object + results = document_store_nested.filter_documents( + filters={ + "operator": "NOT", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.refs.section", "operator": "==", "value": "1"}, + ], + } + ) + assert len(results) == 2 + contents = sorted([r.content for r in results]) + assert contents == ["bgb section 2", "stgb section 1"] + + def test_nested_fields_different_paths_filter(self, document_store_nested: OpenSearchDocumentStore): + """AND filter across different nested paths works correctly.""" + docs = [ + Document( + content="both", + meta={"refs": [{"law": "bgb"}], "tags": [{"name": "important"}]}, + ), + Document( + content="refs only", + meta={"refs": [{"law": "bgb"}], "tags": [{"name": "unimportant"}]}, + ), + Document( + content="tags only", + meta={"refs": [{"law": "stgb"}], "tags": [{"name": "important"}]}, + ), + ] + document_store_nested.write_documents(docs) + + results = document_store_nested.filter_documents( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.tags.name", "operator": "==", "value": "important"}, + ], + } + ) + assert len(results) == 1 + assert results[0].content == "both" diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py index e413c06fa4..4fcefcbfbe 100644 --- a/integrations/opensearch/tests/test_document_store_async.py +++ b/integrations/opensearch/tests/test_document_store_async.py @@ -908,6 +908,64 @@ async def test_query_sql_async_with_fetch_size(self, document_store: OpenSearchD assert len(result["datarows"]) <= 5 assert result.get("cursor") is not None + @pytest.mark.asyncio + async def test_explicit_nested_fields_filter(self, document_store_nested: OpenSearchDocumentStore): + """Filtering on explicitly declared nested fields returns correct documents (async).""" + docs = [ + Document( + content="doc about bgb 1a", + meta={"refs": [{"law": "bgb", "section": "1", "paragraph": "a"}], "status": "active"}, + ), + Document( + content="doc about bgb 2", + meta={"refs": [{"law": "bgb", "section": "2"}], "status": "active"}, + ), + Document( + content="doc about stgb", + meta={"refs": [{"law": "stgb", "section": "1"}], "status": "active"}, + ), + ] + await document_store_nested.write_documents_async(docs) + + results = await document_store_nested.filter_documents_async( + filters={"field": "meta.refs.law", "operator": "==", "value": "bgb"} + ) + assert len(results) == 2 + assert all("bgb" in str(doc.meta["refs"]) for doc in results) + + @pytest.mark.asyncio + async def test_explicit_nested_fields_combined_filter(self, document_store_nested: OpenSearchDocumentStore): + """AND filter across sub-fields of the same nested path matches within the same array element (async).""" + docs = [ + Document( + content="bgb section 1", + meta={"refs": [{"law": "bgb", "section": "1"}, {"law": "stgb", "section": "2"}]}, + ), + Document( + content="bgb section 2", + meta={"refs": [{"law": "bgb", "section": "2"}]}, + ), + Document( + content="stgb section 1", + meta={"refs": [{"law": "stgb", "section": "1"}]}, + ), + ] + await document_store_nested.write_documents_async(docs) + + # Filter: refs.law == bgb AND refs.section == 1 (must match within same nested object) + results = await document_store_nested.filter_documents_async( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.refs.section", "operator": "==", "value": "1"}, + ], + } + ) + assert len(results) == 1 + assert results[0].content == "bgb section 1" + + @pytest.mark.integration @pytest.mark.asyncio async def test_query_sql_async_pagination_flow(self, document_store: OpenSearchDocumentStore): """Test async pagination flow with fetch_size""" diff --git a/integrations/opensearch/tests/test_embedding_retriever.py b/integrations/opensearch/tests/test_embedding_retriever.py index 2d33cbd308..dc35a1ed9d 100644 --- a/integrations/opensearch/tests/test_embedding_retriever.py +++ b/integrations/opensearch/tests/test_embedding_retriever.py @@ -79,6 +79,7 @@ def test_to_dict(_mock_opensearch_client): "use_ssl": None, "verify_certs": None, "timeout": None, + "nested_fields": None, }, "type": "haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore", }, diff --git a/integrations/opensearch/tests/test_filters.py b/integrations/opensearch/tests/test_filters.py index 9ac2cf95d3..03cf664567 100644 --- a/integrations/opensearch/tests/test_filters.py +++ b/integrations/opensearch/tests/test_filters.py @@ -8,7 +8,12 @@ from haystack.errors import FilterError from haystack.testing.document_store import FilterDocumentsTest -from haystack_integrations.document_stores.opensearch.filters import _normalize_ranges, normalize_filters +from haystack_integrations.document_stores.opensearch.filters import ( + _get_logical_condition_nested_path, + _get_nested_path, + _normalize_ranges, + normalize_filters, +) filters_data = [ ( @@ -181,6 +186,276 @@ ] +# (Haystack filters, nested fields, OpenSearch filters) +nested_filters_data = [ + # Single condition on a nested sub-field (top-level comparison, no logical wrapper) + ( + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"refs"}, + {"bool": {"must": {"nested": {"path": "refs", "query": {"term": {"refs.law": "bgb"}}}}}}, + ), + # AND of conditions on the same nested path -> single nested query + ( + { + "operator": "AND", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.refs.section", "operator": "==", "value": "1"}, + ], + }, + {"refs"}, + { + "bool": { + "must": [ + { + "nested": { + "path": "refs", + "query": { + "bool": { + "must": [ + {"term": {"refs.law": "bgb"}}, + {"term": {"refs.section": "1"}}, + ] + } + }, + } + } + ] + } + }, + ), + # OR of conditions on the same nested path + ( + { + "operator": "OR", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.refs.law", "operator": "==", "value": "stgb"}, + ], + }, + {"refs"}, + { + "bool": { + "should": [ + { + "nested": { + "path": "refs", + "query": { + "bool": { + "should": [ + {"term": {"refs.law": "bgb"}}, + {"term": {"refs.law": "stgb"}}, + ] + } + }, + } + } + ] + } + }, + ), + # Mixed: some conditions nested, some flat + ( + { + "operator": "AND", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.status", "operator": "==", "value": "active"}, + ], + }, + {"refs"}, + { + "bool": { + "must": [ + {"term": {"status": "active"}}, + {"nested": {"path": "refs", "query": {"term": {"refs.law": "bgb"}}}}, + ] + } + }, + ), + # Conditions on different nested paths + ( + { + "operator": "AND", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.tags.name", "operator": "==", "value": "important"}, + ], + }, + {"refs", "tags"}, + { + "bool": { + "must": [ + {"nested": {"path": "refs", "query": {"term": {"refs.law": "bgb"}}}}, + {"nested": {"path": "tags", "query": {"term": {"tags.name": "important"}}}}, + ] + } + }, + ), + # Logical sub-group (OR) on the same nested path -> absorbed into single nested query + ( + { + "operator": "AND", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.refs.section", "operator": "==", "value": "1"}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.refs.paragraph", "operator": "==", "value": "a"}, + {"field": "meta.refs.paragraph", "operator": "==", "value": "b"}, + ], + }, + ], + }, + {"refs"}, + { + "bool": { + "must": [ + { + "nested": { + "path": "refs", + "query": { + "bool": { + "must": [ + {"term": {"refs.law": "bgb"}}, + {"term": {"refs.section": "1"}}, + { + "bool": { + "should": [ + {"term": {"refs.paragraph": "a"}}, + {"term": {"refs.paragraph": "b"}}, + ] + } + }, + ] + } + }, + } + } + ] + } + }, + ), + # Logical sub-group mixing nested paths -> NOT absorbed, each gets its own nested query + ( + { + "operator": "AND", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.refs.section", "operator": "==", "value": "1"}, + {"field": "meta.tags.name", "operator": "==", "value": "important"}, + ], + }, + ], + }, + {"refs", "tags"}, + { + "bool": { + "must": [ + { + "bool": { + "should": [ + {"nested": {"path": "refs", "query": {"term": {"refs.section": "1"}}}}, + {"nested": {"path": "tags", "query": {"term": {"tags.name": "important"}}}}, + ] + } + }, + {"nested": {"path": "refs", "query": {"term": {"refs.law": "bgb"}}}}, + ] + } + }, + ), + # NOT of conditions on the same nested path + ( + { + "operator": "NOT", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.refs.section", "operator": "==", "value": "1"}, + ], + }, + {"refs"}, + { + "bool": { + "must_not": [ + { + "bool": { + "must": [ + { + "nested": { + "path": "refs", + "query": { + "bool": { + "must": [ + {"term": {"refs.law": "bgb"}}, + {"term": {"refs.section": "1"}}, + ] + } + }, + } + } + ] + } + } + ] + } + }, + ), + # NOT with mixed nested and flat fields + ( + { + "operator": "NOT", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.status", "operator": "==", "value": "active"}, + ], + }, + {"refs"}, + { + "bool": { + "must_not": [ + { + "bool": { + "must": [ + {"term": {"status": "active"}}, + {"nested": {"path": "refs", "query": {"term": {"refs.law": "bgb"}}}}, + ] + } + } + ] + } + }, + ), + # Range conditions on the same nested sub-field -> merged inside nested query + ( + { + "operator": "AND", + "conditions": [ + {"field": "meta.refs.score", "operator": ">=", "value": 5}, + {"field": "meta.refs.score", "operator": "<", "value": 10}, + ], + }, + {"refs"}, + { + "bool": { + "must": [ + { + "nested": { + "path": "refs", + "query": {"range": {"refs.score": {"gte": 5, "lt": 10}}}, + } + } + ] + } + }, + ), +] + + @pytest.mark.parametrize("filters, expected", filters_data) def test_normalize_filters(filters, expected): result = normalize_filters(filters) @@ -225,6 +500,89 @@ def test_normalize_ranges(): ] +@pytest.mark.parametrize("filters, nested_fields, expected", nested_filters_data) +def test_normalize_filters_with_nested_fields(filters, nested_fields, expected): + result = normalize_filters(filters, nested_fields=nested_fields) + assert result == expected + + +class TestGetConditionNestedPath: + def test_with_meta_prefix(self): + condition = {"field": "meta.refs.law", "operator": "==", "value": "bgb"} + assert _get_nested_path(condition, {"refs"}) == "refs" + + def test_without_meta_prefix(self): + condition = {"field": "refs.law", "operator": "==", "value": "bgb"} + assert _get_nested_path(condition, {"refs"}) == "refs" + + def test_deeply_nested(self): + condition = {"field": "meta.a.b.c", "operator": "==", "value": "x"} + assert _get_nested_path(condition, {"a.b"}) == "a.b" + + def test_field_is_nested_path_itself(self): + """The field itself is the nested path -- not a sub-field, so returns None.""" + condition = {"field": "meta.refs", "operator": "==", "value": "x"} + assert _get_nested_path(condition, {"refs"}) is None + + def test_no_nested_match(self): + condition = {"field": "meta.status", "operator": "==", "value": "active"} + assert _get_nested_path(condition, {"refs"}) is None + + def test_empty_nested_fields(self): + condition = {"field": "meta.refs.law", "operator": "==", "value": "bgb"} + assert _get_nested_path(condition, set()) is None + + +class TestGetLogicalConditionNestedPath: + def test_all_same_nested_path(self): + condition = { + "operator": "OR", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.refs.section", "operator": "==", "value": "1"}, + ], + } + assert _get_logical_condition_nested_path(condition, {"refs"}) == "refs" + + def test_mixed_nested_paths(self): + condition = { + "operator": "OR", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.tags.name", "operator": "==", "value": "important"}, + ], + } + assert _get_logical_condition_nested_path(condition, {"refs", "tags"}) is None + + def test_nested_logical_subgroup(self): + """Deeply nested logical groups that all target the same path.""" + condition = { + "operator": "AND", + "conditions": [ + { + "operator": "OR", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.refs.law", "operator": "==", "value": "stgb"}, + ], + }, + {"field": "meta.refs.section", "operator": "==", "value": "1"}, + ], + } + assert _get_logical_condition_nested_path(condition, {"refs"}) == "refs" + + def test_some_non_nested(self): + """If some leaves are not on a nested path, returns None.""" + condition = { + "operator": "OR", + "conditions": [ + {"field": "meta.refs.law", "operator": "==", "value": "bgb"}, + {"field": "meta.status", "operator": "==", "value": "active"}, + ], + } + assert _get_logical_condition_nested_path(condition, {"refs"}) is None + + @pytest.mark.integration class TestFilters(FilterDocumentsTest): def assert_documents_are_equal(self, received: list[Document], expected: list[Document]): diff --git a/integrations/opensearch/tests/test_open_search_hybrid_retriever.py b/integrations/opensearch/tests/test_open_search_hybrid_retriever.py index 110b1ccf2d..201eb06044 100644 --- a/integrations/opensearch/tests/test_open_search_hybrid_retriever.py +++ b/integrations/opensearch/tests/test_open_search_hybrid_retriever.py @@ -53,6 +53,7 @@ class TestOpenSearchHybridRetriever: "use_ssl": None, "verify_certs": None, "timeout": None, + "nested_fields": None, }, }, "embedder": {