Skip to content

Commit b735b76

Browse files
authored
Merge branch 'main' into move-github-components
2 parents ca977a1 + fa8b9e4 commit b735b76

23 files changed

Lines changed: 1161 additions & 107 deletions

File tree

integrations/azure_ai_search/CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# Changelog
22

3+
## [integrations/azure_ai_search-v2.0.1] - 2025-04-16
4+
5+
### 🚀 Features
6+
7+
- Fix serialization errors in `AzureAISearchDocumentStore` (#1647)
8+
9+
310
## [integrations/azure_ai_search-v2.0.0] - 2025-04-03
411

512
### 🐛 Bug Fixes
@@ -21,6 +28,7 @@
2128

2229
## [integrations/azure_ai_search-v0.1.3] - 2025-02-17
2330

31+
2432
### 🧹 Chores
2533

2634
- Azure AI search - clarify that dataframe is not supported (#1407)

integrations/azure_ai_search/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
156156
[tool.pytest.ini_options]
157157
minversion = "6.0"
158158
markers = ["unit: unit tests", "integration: integration tests"]
159+
pythonpath = ["src"]
159160

160161
[[tool.mypy.overrides]]
161162
module = ["haystack.*", "haystack_integrations.*", "pytest.*", "azure.identity.*", "mypy.*", "azure.core.*", "azure.search.documents.*"]

integrations/azure_ai_search/src/haystack_integrations/document_stores/azure_ai_search/document_store.py

Lines changed: 93 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,30 @@
44
import logging as python_logging
55
import os
66
from datetime import datetime
7-
from typing import Any, Dict, List, Optional
7+
from typing import Any, ClassVar, Dict, List, Optional
88

99
from azure.core.credentials import AzureKeyCredential
1010
from azure.core.exceptions import ClientAuthenticationError, HttpResponseError, ResourceNotFoundError
1111
from azure.identity import DefaultAzureCredential
1212
from azure.search.documents import SearchClient
1313
from azure.search.documents.indexes import SearchIndexClient
1414
from azure.search.documents.indexes.models import (
15+
CharFilter,
16+
CorsOptions,
1517
HnswAlgorithmConfiguration,
1618
HnswParameters,
19+
LexicalAnalyzer,
20+
LexicalTokenizer,
21+
ScoringProfile,
1722
SearchableField,
1823
SearchField,
1924
SearchFieldDataType,
2025
SearchIndex,
26+
SearchResourceEncryptionKey,
27+
SearchSuggester,
28+
SimilarityAlgorithm,
2129
SimpleField,
30+
TokenFilter,
2231
VectorSearch,
2332
VectorSearchAlgorithmMetric,
2433
VectorSearchProfile,
@@ -40,6 +49,19 @@
4049
datetime: "Edm.DateTimeOffset",
4150
}
4251

52+
# Map of expected field names to their corresponding classes
53+
AZURE_CLASS_MAPPING = {
54+
"suggesters": SearchSuggester,
55+
"analyzers": LexicalAnalyzer,
56+
"tokenizers": LexicalTokenizer,
57+
"token_filters": TokenFilter,
58+
"char_filters": CharFilter,
59+
"cors_options": CorsOptions,
60+
"similarity_algorithm": SimilarityAlgorithm,
61+
"encryption_key": SearchResourceEncryptionKey,
62+
"scoring_profiles": ScoringProfile,
63+
}
64+
4365
DEFAULT_VECTOR_SEARCH = VectorSearch(
4466
profiles=[
4567
VectorSearchProfile(name="default-vector-config", algorithm_configuration_name="cosine-algorithm-config")
@@ -60,6 +82,8 @@
6082

6183

6284
class AzureAISearchDocumentStore:
85+
TYPE_MAP: ClassVar[Dict[str, type]] = {"str": str, "int": int, "float": float, "bool": bool, "datetime": datetime}
86+
6387
def __init__(
6488
self,
6589
*,
@@ -138,7 +162,7 @@ def client(self) -> SearchClient:
138162
"The index '{idx_name}' does not exist. A new index will be created.",
139163
idx_name=self._index_name,
140164
)
141-
self._create_index(self._index_name)
165+
self._create_index()
142166
except (HttpResponseError, ClientAuthenticationError) as error:
143167
msg = f"Failed to authenticate with Azure Search: {error}"
144168
raise AzureAISearchDocumentStoreConfigError(msg) from error
@@ -154,11 +178,9 @@ def client(self) -> SearchClient:
154178

155179
return self._client
156180

157-
def _create_index(self, index_name: str) -> None:
181+
def _create_index(self) -> None:
158182
"""
159-
Creates a new search index.
160-
:param index_name: Name of the index to create. If None, the index name from the constructor is used.
161-
:param kwargs: Optional keyword parameters.
183+
Internally creates a new search index.
162184
"""
163185

164186
# default fields to create index based on Haystack Document (id, content, embedding)
@@ -175,19 +197,66 @@ def _create_index(self, index_name: str) -> None:
175197
),
176198
]
177199

178-
if not index_name:
179-
index_name = self._index_name
180200
if self._metadata_fields:
181201
default_fields.extend(self._create_metadata_index_fields(self._metadata_fields))
182202
index = SearchIndex(
183-
name=index_name,
203+
name=self._index_name,
184204
fields=default_fields,
185205
vector_search=self._vector_search_configuration,
186206
**self._index_creation_kwargs,
187207
)
188208
if self._index_client:
189209
self._index_client.create_index(index)
190210

211+
@classmethod
def _deserialize_metadata_fields(cls, fields: Optional[Dict[str, str]]) -> Optional[Dict[str, type]]:
    """
    Convert serialized type names back to Python type objects.

    Each value in `fields` is looked up in the class-level `TYPE_MAP`.

    :param fields: Mapping of metadata field name to a type name, or None.
    :returns: Mapping of metadata field name to the matching type object, or None when `fields`
        is None or empty.
    :raises ValueError: If a value is not a key of `TYPE_MAP`, including values that cannot be
        used as a lookup key at all (for example a list).
    """
    if not fields:
        return None
    try:
        return {key: cls.TYPE_MAP[value] for key, value in fields.items()}
    except (KeyError, TypeError) as e:
        # TypeError covers unhashable entries, so malformed input surfaces as the same
        # ValueError as an unknown type name instead of leaking a low-level error.
        msg = f"Unsupported type encountered in metadata_fields: {e}"
        raise ValueError(msg) from e
223+
224+
@staticmethod
def _serialize_index_creation_kwargs(index_creation_kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """
    Serializes the index creation kwargs to a dictionary.

    Any value exposing an `as_dict` method is replaced by the result of calling it. Lists are
    handled item by item, so a list that mixes such objects with plain values is still fully
    converted. All other values are passed through unchanged.

    :param index_creation_kwargs: Keyword arguments that were supplied for index creation.
    :returns: A new dictionary with the same keys and serialized values.
    """

    def _to_plain(item: Any) -> Any:
        # Duck-typed on `as_dict` rather than on a concrete base class.
        return item.as_dict() if hasattr(item, "as_dict") else item

    result: Dict[str, Any] = {}
    for key, value in index_creation_kwargs.items():
        if isinstance(value, list):
            # Build a fresh list so the serialized output never aliases the stored kwargs.
            result[key] = [_to_plain(item) for item in value]
        else:
            result[key] = _to_plain(value)
    return result
240+
241+
@classmethod
def _deserialize_index_creation_kwargs(cls, data: Dict[str, Any]) -> Any:
    """
    Deserializes index creation kwargs back to the original classes.

    For every key found in the module-level `AZURE_CLASS_MAPPING`, the mapped class's
    `from_dict` is applied to the value, or to each item when the value is a list. Values for
    other keys, and `None` values, are passed through unchanged.

    :param data: Mapping of kwarg name to its serialized value. `from_dict` in this file passes
        a single-entry mapping.
    :returns: The deserialized value of the last entry in `data` (the only entry for a
        single-entry mapping), or None when `data` is empty.
    """
    # NOTE(review): only the last entry's value is returned, not a dictionary. This is kept
    # because the visible caller passes one entry and stores the return value directly.
    deserialized: Any = None
    for key, value in data.items():
        target_cls = AZURE_CLASS_MAPPING.get(key)
        if target_cls is None or value is None:
            # Unknown kwarg, or nothing to rebuild: keep the value as-is.
            deserialized = value
        elif isinstance(value, list):
            deserialized = [target_cls.from_dict(item) for item in value]
        else:
            deserialized = target_cls.from_dict(value)
    return deserialized
259+
191260
def to_dict(self) -> Dict[str, Any]:
192261
# This is not the best solution to serialise this class but is the fastest to implement.
193262
# Not all kwargs types can be serialised to text so this can fail. We must serialise each
@@ -198,15 +267,21 @@ def to_dict(self) -> Dict[str, Any]:
198267
:returns:
199268
Dictionary with serialized data.
200269
"""
270+
271+
if self._metadata_fields:
272+
serialized_metadata = {key: value.__name__ for key, value in self._metadata_fields.items()}
273+
else:
274+
serialized_metadata = None
275+
201276
return default_to_dict(
202277
self,
203278
azure_endpoint=self._azure_endpoint.to_dict() if self._azure_endpoint else None,
204279
api_key=self._api_key.to_dict() if self._api_key else None,
205280
index_name=self._index_name,
206281
embedding_dimension=self._embedding_dimension,
207-
metadata_fields=self._metadata_fields,
282+
metadata_fields=serialized_metadata,
208283
vector_search_configuration=self._vector_search_configuration.as_dict(),
209-
**self._index_creation_kwargs,
284+
**self._serialize_index_creation_kwargs(self._index_creation_kwargs),
210285
)
211286

212287
@classmethod
@@ -220,6 +295,13 @@ def from_dict(cls, data: Dict[str, Any]) -> "AzureAISearchDocumentStore":
220295
:returns:
221296
Deserialized component.
222297
"""
298+
if (fields := data["init_parameters"]["metadata_fields"]) is not None:
299+
data["init_parameters"]["metadata_fields"] = cls._deserialize_metadata_fields(fields)
300+
301+
for key, _value in AZURE_CLASS_MAPPING.items():
302+
if key in data["init_parameters"]:
303+
param_value = data["init_parameters"].get(key)
304+
data["init_parameters"][key] = cls._deserialize_index_creation_kwargs({key: param_value})
223305

224306
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key", "azure_endpoint"])
225307
if (vector_search_configuration := data["init_parameters"].get("vector_search_configuration")) is not None:

integrations/azure_ai_search/tests/test_document_store.py

Lines changed: 127 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from unittest.mock import patch
99

1010
import pytest
11+
from azure.search.documents.indexes.models import CustomAnalyzer, SearchResourceEncryptionKey
1112
from haystack.dataclasses.document import Document
1213
from haystack.errors import FilterError
1314
from haystack.testing.document_store import (
@@ -18,10 +19,12 @@
1819
)
1920
from haystack.utils.auth import EnvVarSecret, Secret
2021

21-
from haystack_integrations.document_stores.azure_ai_search import DEFAULT_VECTOR_SEARCH, AzureAISearchDocumentStore
22+
from haystack_integrations.document_stores.azure_ai_search import (
23+
DEFAULT_VECTOR_SEARCH,
24+
AzureAISearchDocumentStore,
25+
)
2226

2327

24-
@patch("haystack_integrations.document_stores.azure_ai_search.document_store.AzureAISearchDocumentStore")
2528
def test_to_dict(monkeypatch):
2629
monkeypatch.setenv("AZURE_AI_SEARCH_API_KEY", "test-api-key")
2730
monkeypatch.setenv("AZURE_AI_SEARCH_ENDPOINT", "test-endpoint")
@@ -51,7 +54,68 @@ def test_to_dict(monkeypatch):
5154
}
5255

5356

54-
@patch("haystack_integrations.document_stores.azure_ai_search.document_store.AzureAISearchDocumentStore")
57+
def test_to_dict_with_params(monkeypatch):
58+
monkeypatch.setenv("AZURE_AI_SEARCH_API_KEY", "test-api-key")
59+
monkeypatch.setenv("AZURE_AI_SEARCH_ENDPOINT", "test-endpoint")
60+
encryption_key = SearchResourceEncryptionKey(
61+
key_name="my-key",
62+
key_version="my-version",
63+
vault_uri="my-uri",
64+
)
65+
analyzer = CustomAnalyzer(
66+
name="url-analyze",
67+
tokenizer_name="uax_url_email",
68+
token_filters=["lowercase"], # Using token filter name directly as string
69+
)
70+
document_store = AzureAISearchDocumentStore(
71+
index_name="my_index",
72+
embedding_dimension=15,
73+
metadata_fields={"Title": str, "Pages": int},
74+
encryption_key=encryption_key,
75+
analyzers=[analyzer],
76+
)
77+
78+
res = document_store.to_dict()
79+
assert res == {
80+
"type": "haystack_integrations.document_stores.azure_ai_search.document_store.AzureAISearchDocumentStore",
81+
"init_parameters": {
82+
"azure_endpoint": {"env_vars": ["AZURE_AI_SEARCH_ENDPOINT"], "strict": True, "type": "env_var"},
83+
"api_key": {"env_vars": ["AZURE_AI_SEARCH_API_KEY"], "strict": False, "type": "env_var"},
84+
"index_name": "my_index",
85+
"embedding_dimension": 15,
86+
"metadata_fields": {
87+
"Title": "str",
88+
"Pages": "int",
89+
},
90+
"encryption_key": {
91+
"key_name": "my-key",
92+
"key_version": "my-version",
93+
"vault_uri": "my-uri",
94+
},
95+
"analyzers": [
96+
{
97+
"name": "url-analyze",
98+
"odata_type": "#Microsoft.Azure.Search.CustomAnalyzer",
99+
"tokenizer_name": "uax_url_email",
100+
"token_filters": ["lowercase"],
101+
}
102+
],
103+
"vector_search_configuration": {
104+
"profiles": [
105+
{"name": "default-vector-config", "algorithm_configuration_name": "cosine-algorithm-config"}
106+
],
107+
"algorithms": [
108+
{
109+
"name": "cosine-algorithm-config",
110+
"kind": "hnsw",
111+
"parameters": {"m": 4, "ef_construction": 400, "ef_search": 500, "metric": "cosine"},
112+
}
113+
],
114+
},
115+
},
116+
}
117+
118+
55119
def test_from_dict(monkeypatch):
56120
monkeypatch.setenv("AZURE_AI_SEARCH_API_KEY", "test-api-key")
57121
monkeypatch.setenv("AZURE_AI_SEARCH_ENDPOINT", "test-endpoint")
@@ -76,6 +140,66 @@ def test_from_dict(monkeypatch):
76140
assert document_store._vector_search_configuration == DEFAULT_VECTOR_SEARCH
77141

78142

143+
def test_from_dict_with_params(monkeypatch):
144+
monkeypatch.setenv("AZURE_AI_SEARCH_API_KEY", "test-api-key")
145+
monkeypatch.setenv("AZURE_AI_SEARCH_ENDPOINT", "test-endpoint")
146+
encryption_key = SearchResourceEncryptionKey(
147+
key_name="my-key",
148+
key_version="my-version",
149+
vault_uri="my-uri",
150+
)
151+
152+
data = {
153+
"type": "haystack_integrations.document_stores.azure_ai_search.document_store.AzureAISearchDocumentStore",
154+
"init_parameters": {
155+
"azure_endpoint": {"env_vars": ["AZURE_AI_SEARCH_ENDPOINT"], "strict": True, "type": "env_var"},
156+
"api_key": {"env_vars": ["AZURE_AI_SEARCH_API_KEY"], "strict": False, "type": "env_var"},
157+
"index_name": "my_index",
158+
"embedding_dimension": 15,
159+
"metadata_fields": {
160+
"Title": "str",
161+
"Pages": "int",
162+
},
163+
"encryption_key": {
164+
"key_name": "my-key",
165+
"key_version": "my-version",
166+
"vault_uri": "my-uri",
167+
},
168+
"analyzers": [
169+
{
170+
"name": "url-analyze",
171+
"odata_type": "#Microsoft.Azure.Search.CustomAnalyzer",
172+
"tokenizer_name": "uax_url_email",
173+
"token_filters": ["lowercase"],
174+
}
175+
],
176+
"vector_search_configuration": {
177+
"profiles": [
178+
{"name": "default-vector-config", "algorithm_configuration_name": "cosine-algorithm-config"}
179+
],
180+
"algorithms": [
181+
{
182+
"name": "cosine-algorithm-config",
183+
"kind": "hnsw",
184+
"parameters": {"m": 4, "ef_construction": 400, "ef_search": 500, "metric": "cosine"},
185+
}
186+
],
187+
},
188+
},
189+
}
190+
document_store = AzureAISearchDocumentStore.from_dict(data)
191+
assert isinstance(document_store._api_key, EnvVarSecret)
192+
assert isinstance(document_store._azure_endpoint, EnvVarSecret)
193+
assert document_store._index_name == "my_index"
194+
assert document_store._embedding_dimension == 15
195+
assert document_store._metadata_fields == {"Title": str, "Pages": int}
196+
assert document_store._index_creation_kwargs["encryption_key"] == encryption_key
197+
assert document_store._index_creation_kwargs["analyzers"][0].name == "url-analyze"
198+
assert document_store._index_creation_kwargs["analyzers"][0].token_filters == ["lowercase"]
199+
assert "CustomAnalyzer" in document_store._index_creation_kwargs["analyzers"][0].odata_type
200+
assert document_store._vector_search_configuration.as_dict() == DEFAULT_VECTOR_SEARCH.as_dict()
201+
202+
79203
@patch("haystack_integrations.document_stores.azure_ai_search.document_store.AzureAISearchDocumentStore")
80204
def test_init_is_lazy(_mock_azure_search_client):
81205
AzureAISearchDocumentStore(azure_endpoint=Secret.from_token("test_endpoint"))
@@ -84,7 +208,6 @@ def test_init_is_lazy(_mock_azure_search_client):
84208

85209
@patch("haystack_integrations.document_stores.azure_ai_search.document_store.AzureAISearchDocumentStore")
86210
def test_init(_mock_azure_search_client):
87-
88211
document_store = AzureAISearchDocumentStore(
89212
api_key=Secret.from_token("fake-api-key"),
90213
azure_endpoint=Secret.from_token("fake_endpoint"),

0 commit comments

Comments
 (0)