test: Azure AI Search - add unit tests (#3201)

anakin87 · web-flow · commit ead7d68f9d88 · 2026-04-22T11:04:44.000+02:00
diff --git a/integrations/azure_ai_search/tests/test_bm25_retriever.py b/integrations/azure_ai_search/tests/test_bm25_retriever.py
@@ -149,6 +149,19 @@ def test_run_time_params():
     assert res["documents"][0].content == "Test doc"
 
 
+def test_init_raises_type_error_on_invalid_document_store():  # a bare object() must be rejected as a store
+    with pytest.raises(TypeError, match="document_store must be an instance of AzureAISearchDocumentStore"):
+        AzureAISearchBM25Retriever(document_store=object())
+
+
+def test_run_raises_runtime_error_when_retrieval_fails():  # a failing store call must surface from run()
+    mock_store = Mock(spec=AzureAISearchDocumentStore)  # spec= makes isinstance() against the store class pass
+    mock_store._bm25_retrieval.side_effect = RuntimeError("boom")  # the store call will raise
+    retriever = AzureAISearchBM25Retriever(document_store=mock_store)
+    with pytest.raises(RuntimeError, match="bm25 retrieval process"):  # text is not "boom": presumably wrapped
+        retriever.run(query="Test query")
+
+
 @pytest.mark.skipif(
     not os.environ.get("AZURE_AI_SEARCH_ENDPOINT", None) and not os.environ.get("AZURE_AI_SEARCH_API_KEY", None),
     reason="Missing AZURE_AI_SEARCH_ENDPOINT or AZURE_AI_SEARCH_API_KEY.",
diff --git a/integrations/azure_ai_search/tests/test_document_store.py b/integrations/azure_ai_search/tests/test_document_store.py
@@ -10,6 +10,7 @@
 
 import pytest
 from azure.core.credentials import TokenCredential
+from azure.core.exceptions import ResourceNotFoundError
 from azure.search.documents.indexes.models import (
     CustomAnalyzer,
     SearchableField,
@@ -336,6 +337,128 @@ def test_query_sql_raises_not_implemented():
         document_store.query_sql("SELECT * FROM test-index")
 
 
+@pytest.mark.parametrize(  # each case: bad metadata_fields -> regex searched in the ValueError text
+    "metadata_fields, expected_error_match",
+    [
+        (
+            {"Title": SearchField(name="mismatched", type="Edm.String", filterable=True)},  # key != field name
+            "Name of SearchField",
+        ),
+        ({"Pages": object}, "Unsupported field type"),  # object: expected to be rejected as unsupported
+    ],
+)
+def test_normalize_metadata_index_fields_raises(metadata_fields, expected_error_match):
+    with pytest.raises(ValueError, match=expected_error_match):
+        AzureAISearchDocumentStore._normalize_metadata_index_fields(metadata_fields)
+
+
+def test_normalize_metadata_index_fields_skips_non_alpha_keys(caplog):  # bad key: dropped with a warning
+    with caplog.at_level(logging.WARNING):  # capture WARNING-level log records
+        normalized = AzureAISearchDocumentStore._normalize_metadata_index_fields({"1invalid": str, "valid": int})
+    assert "valid" in normalized
+    assert "1invalid" not in normalized  # the key starting with a digit is expected to be left out
+    assert "Invalid key" in caplog.text
+
+
+def test_normalize_metadata_index_fields_returns_empty_for_none():  # None input -> empty dict
+    assert AzureAISearchDocumentStore._normalize_metadata_index_fields(None) == {}
+
+
+@pytest.mark.parametrize(  # each case: private retrieval method, its kwargs, regex searched in the ValueError
+    "method, kwargs, expected_match",
+    [
+        ("_bm25_retrieval", {"query": None}, "query must not be None"),
+        ("_hybrid_retrieval", {"query": None, "query_embedding": [0.1]}, "query must not be None"),
+        ("_hybrid_retrieval", {"query": "q", "query_embedding": []}, "query_embedding must be a non-empty"),
+        ("_embedding_retrieval", {"query_embedding": []}, "query_embedding must be a non-empty"),
+    ],
+)
+def test_internal_retrieval_validates_inputs(method, kwargs, expected_match):
+    document_store = AzureAISearchDocumentStore(  # placeholder key and endpoint
+        api_key=Secret.from_token("fake-api-key"),
+        azure_endpoint=Secret.from_token("fake-endpoint"),
+        index_name="test-index",
+    )
+    with pytest.raises(ValueError, match=expected_match):
+        getattr(document_store, method)(**kwargs)  # call the parametrized method by name
+
+
+def test_collect_unique_values_combines_lists_and_scalars():  # list, scalar and None values in one field
+    docs = [
+        {"tags": ["a", "b"]},
+        {"tags": "c"},  # scalar value
+        {"tags": None},  # None: absent from the expected set
+        {"tags": ["a", "d"]},  # "a" repeats; the expected result is a set, so it appears once
+    ]
+    assert AzureAISearchDocumentStore._collect_unique_values(docs, "tags") == {"a", "b", "c", "d"}
+
+
+@pytest.mark.parametrize(  # each case: raw documents -> expected {"min", "max"} dict for field "x"
+    "docs, expected",
+    [
+        ([], {"min": None, "max": None}),  # no documents
+        ([{"x": None}, {"x": [1, 2]}], {"min": None, "max": None}),  # None and list values: no min/max expected
+        ([{"x": 3}, {"x": 1}, {"x": 2}], {"min": 1, "max": 3}),  # unsorted scalar values
+    ],
+)
+def test_get_min_max_from_documents(docs, expected):
+    assert AzureAISearchDocumentStore._get_min_max_from_documents(docs, "x") == expected
+
+
+@pytest.mark.parametrize(  # each case: Azure index field object -> expected type-name string
+    "field, expected_type",
+    [
+        (SimpleField(name="cat", type=SearchFieldDataType.String, filterable=True), "keyword"),
+        (SearchableField(name="content", type=SearchFieldDataType.String), "text"),  # String, but searchable
+        (SearchableField(name="title", type=SearchFieldDataType.String), "text"),
+        (SimpleField(name="year", type=SearchFieldDataType.Int32, filterable=True), "long"),
+        (SimpleField(name="rating", type=SearchFieldDataType.Double, filterable=True), "double"),
+        (
+            SearchField(
+                name="tags",
+                type=SearchFieldDataType.Collection(SearchFieldDataType.String),  # collection of strings
+                filterable=True,
+            ),
+            "keyword",
+        ),
+        (SimpleField(name="when", type=SearchFieldDataType.DateTimeOffset, filterable=True), "date"),
+    ],
+)
+def test_map_azure_field_type_variants(field, expected_type):
+    assert AzureAISearchDocumentStore._map_azure_field_type(field) == expected_type
+
+
+def test_map_azure_field_type_without_type_attribute():  # a field lacking `type` still maps to "keyword"
+    field = Mock(spec=[])  # empty spec: reading any attribute that was not set raises AttributeError
+    field.name = "custom"
+    assert AzureAISearchDocumentStore._map_azure_field_type(field) == "keyword"
+
+
+def test_index_exists_raises_without_index_name():  # a None index name is expected to be rejected
+    document_store = AzureAISearchDocumentStore(  # placeholder key and endpoint
+        api_key=Secret.from_token("fake-api-key"),
+        azure_endpoint=Secret.from_token("fake-endpoint"),
+        index_name="test-index",
+    )
+    document_store._index_client = Mock()  # swap the private index client for a mock
+    with pytest.raises(ValueError, match="Index name is required"):
+        document_store._index_exists(None)
+
+
+def test_get_raw_documents_by_id_skips_not_found(caplog):  # a missing id is logged and skipped
+    store, search_client, _ = _build_mock_document_store_with_schema(  # helper not shown in this diff
+        [SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True)]
+    )
+    search_client.get_document.side_effect = [  # one entry per call; exception entries are raised
+        {"id": "1", "content": "c1"},
+        ResourceNotFoundError("not found"),
+    ]
+    with caplog.at_level(logging.WARNING):  # capture WARNING-level log records
+        result = store._get_raw_documents_by_id(["1", "missing"])
+    assert result == [{"id": "1", "content": "c1"}]
+    assert "missing" in caplog.text  # the skipped id is expected in the log output
+
+
 def _assert_documents_are_equal(received: list[Document], expected: list[Document]):
     """
     Assert that two lists of Documents are equal.
diff --git a/integrations/azure_ai_search/tests/test_embedding_retriever.py b/integrations/azure_ai_search/tests/test_embedding_retriever.py
@@ -162,6 +162,19 @@ def test_run_time_params():
     assert res["documents"][0].embedding == [0.1, 0.2]
 
 
+def test_init_raises_on_invalid_document_store():  # NOTE(review): only base Exception is asserted — confirm
+    with pytest.raises(Exception, match="document_store must be an instance of AzureAISearchDocumentStore"):
+        AzureAISearchEmbeddingRetriever(document_store=object())
+
+
+def test_run_raises_runtime_error_when_retrieval_fails():  # a failing store call must surface from run()
+    mock_store = Mock(spec=AzureAISearchDocumentStore)  # spec= makes isinstance() against the store class pass
+    mock_store._embedding_retrieval.side_effect = RuntimeError("boom")  # the store call will raise
+    retriever = AzureAISearchEmbeddingRetriever(document_store=mock_store)
+    with pytest.raises(RuntimeError, match="embedding retrieval process"):  # not "boom": presumably wrapped
+        retriever.run(query_embedding=[0.1, 0.2])
+
+
 @pytest.mark.skipif(
     not os.environ.get("AZURE_AI_SEARCH_ENDPOINT", None) and not os.environ.get("AZURE_AI_SEARCH_API_KEY", None),
     reason="Missing AZURE_AI_SEARCH_ENDPOINT or AZURE_AI_SEARCH_API_KEY.",
diff --git a/integrations/azure_ai_search/tests/test_filters_unit.py b/integrations/azure_ai_search/tests/test_filters_unit.py
@@ -0,0 +1,124 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+from haystack_integrations.document_stores.azure_ai_search.errors import AzureAISearchDocumentStoreFilterError
+from haystack_integrations.document_stores.azure_ai_search.filters import _normalize_filters
+
+
+@pytest.mark.parametrize(  # each case: one comparison filter dict -> expected filter string
+    "filters, expected",
+    [
+        ({"field": "meta.name", "operator": "==", "value": "alice"}, "name eq 'alice'"),  # "meta." is dropped
+        ({"field": "meta.active", "operator": "==", "value": True}, "active eq true"),  # bool -> lowercase
+        ({"field": "meta.count", "operator": "==", "value": 3}, "count eq 3"),
+        ({"field": "meta.name", "operator": "==", "value": None}, "name eq null"),  # None -> null
+        ({"field": "meta.name", "operator": "!=", "value": "alice"}, "not (name eq 'alice')"),  # negated eq
+        ({"field": "meta.active", "operator": "!=", "value": False}, "not (active eq false)"),
+        ({"field": "meta.count", "operator": "!=", "value": 3}, "not (count eq 3)"),
+        (
+            {"field": "meta.page", "operator": "in", "value": ["1", "2"]},
+            "search.in(page,'1,2',',')",  # values joined with "," and "," passed as the delimiter
+        ),
+        ({"field": "meta.count", "operator": ">", "value": 5}, "count gt 5"),
+        ({"field": "meta.count", "operator": ">=", "value": 5}, "count ge 5"),
+        ({"field": "meta.count", "operator": "<", "value": 5}, "count lt 5"),
+        ({"field": "meta.count", "operator": "<=", "value": 5}, "count le 5"),
+        (
+            {"field": "meta.date", "operator": ">", "value": "2020-01-01T00:00:00Z"},
+            "date gt 2020-01-01T00:00:00Z",  # the date string is emitted unquoted
+        ),
+        ({"field": "bare_field", "operator": "==", "value": "x"}, "bare_field eq 'x'"),  # no "meta." prefix
+    ],
+)
+def test_normalize_filters_comparison_conditions(filters, expected):
+    assert _normalize_filters(filters) == expected
+
+
+@pytest.mark.parametrize(  # each case: logical (AND/OR/NOT) filter tree -> expected filter string
+    "filters, expected",
+    [
+        (
+            {
+                "operator": "AND",
+                "conditions": [
+                    {"field": "meta.name", "operator": "==", "value": "alice"},
+                    {"field": "meta.count", "operator": ">=", "value": 1},
+                ],
+            },
+            "(name eq 'alice') and (count ge 1)",  # each condition is parenthesized
+        ),
+        (
+            {
+                "operator": "OR",
+                "conditions": [
+                    {"field": "meta.name", "operator": "==", "value": "alice"},
+                    {"field": "meta.name", "operator": "==", "value": "bob"},
+                ],
+            },
+            "(name eq 'alice') or (name eq 'bob')",
+        ),
+        (
+            {
+                "operator": "NOT",
+                "conditions": [{"field": "meta.name", "operator": "==", "value": "alice"}],
+            },
+            "not ((name eq 'alice'))",  # NOT wraps the already-parenthesized condition
+        ),
+        (
+            {
+                "operator": "AND",
+                "conditions": [
+                    {"field": "meta.name", "operator": "==", "value": "alice"},
+                    {  # a logical group nested as a condition
+                        "operator": "OR",
+                        "conditions": [
+                            {"field": "meta.count", "operator": ">", "value": 1},
+                            {"field": "meta.count", "operator": "<", "value": 10},
+                        ],
+                    },
+                ],
+            },
+            "(name eq 'alice') and ((count gt 1) or (count lt 10))",  # nested OR keeps its own parentheses
+        ),
+    ],
+)
+def test_normalize_filters_logical_conditions(filters, expected):
+    assert _normalize_filters(filters) == expected
+
+
+@pytest.mark.parametrize(  # each case: malformed filters -> regex searched in the filter error text
+    "filters, expected_match",
+    [
+        ("not a dict", "Filters must be a dictionary"),
+        ({"operator": "AND"}, "Missing key"),  # logical operator without "conditions"
+        ({"conditions": []}, "Missing key"),  # neither "operator" nor "field"
+        (
+            {"operator": "XOR", "conditions": [{"field": "a", "operator": "==", "value": 1}]},
+            "Unknown operator XOR",
+        ),
+        ({"field": "f"}, "Missing key"),  # comparison without "operator" and "value"
+        ({"field": "f", "operator": "???", "value": 1}, "Unknown operator"),
+        (
+            {"field": "f", "operator": ">", "value": "not-a-date"},  # ">" with a non-date string
+            "Invalid value type",
+        ),
+        (
+            {"field": "f", "operator": ">", "value": [1, 2]},  # ">" with a list operand
+            "Invalid value type",
+        ),
+        (
+            {"field": "f", "operator": "in", "value": "not-a-list"},  # "in" with a plain string
+            "only supports a list of strings",
+        ),
+        (
+            {"field": "f", "operator": "in", "value": [1, 2]},  # "in" with a list of ints
+            "only supports a list of strings",
+        ),
+    ],
+)
+def test_normalize_filters_raises_on_invalid_input(filters, expected_match):
+    with pytest.raises(AzureAISearchDocumentStoreFilterError, match=expected_match):
+        _normalize_filters(filters)
diff --git a/integrations/azure_ai_search/tests/test_hybrid_retriever.py b/integrations/azure_ai_search/tests/test_hybrid_retriever.py
@@ -168,6 +168,19 @@ def test_run_time_params():
     assert res["documents"][0].embedding == [0.1, 0.2]
 
 
+def test_init_raises_type_error_on_invalid_document_store():  # a bare object() must be rejected as a store
+    with pytest.raises(TypeError, match="document_store must be an instance of AzureAISearchDocumentStore"):
+        AzureAISearchHybridRetriever(document_store=object())
+
+
+def test_run_raises_runtime_error_when_retrieval_fails():  # a failing store call must surface from run()
+    mock_store = Mock(spec=AzureAISearchDocumentStore)  # spec= makes isinstance() against the store class pass
+    mock_store._hybrid_retrieval.side_effect = RuntimeError("boom")  # the store call will raise
+    retriever = AzureAISearchHybridRetriever(document_store=mock_store)
+    with pytest.raises(RuntimeError, match="hybrid retrieval process"):  # text is not "boom": presumably wrapped
+        retriever.run(query="Test query", query_embedding=[0.1, 0.2])
+
+
 @pytest.mark.skipif(
     not os.environ.get("AZURE_AI_SEARCH_ENDPOINT", None) and not os.environ.get("AZURE_AI_SEARCH_API_KEY", None),
     reason="Missing AZURE_AI_SEARCH_ENDPOINT or AZURE_AI_SEARCH_API_KEY.",