Skip to content

Commit 333f56b

Browse files
fix(PLU-348): azure ai search — improve unknown-field dropping against the index schema (#701)
1 parent 2bd3dc0 commit 333f56b

5 files changed

Lines changed: 633 additions & 42 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## [1.5.1]
2+
3+
### Fixes
4+
5+
- **fix(azure_ai_search): recursively drop unknown fields against the index schema.** New nested fields from `unstructured` (e.g. `metadata.table_extraction_method`) were reaching the service and causing HTTP 400s; the filter now recurses through `index.fields`.
6+
17
## [1.5.0]
28

39
### Enhancements

test/integration/connectors/test_azure_ai_search.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,62 @@ async def test_azure_ai_search_destination(
237237
validate_count(search_client=search_client, expected_count=expected_count)
238238

239239

240+
@pytest.mark.asyncio
241+
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
242+
@requires_env("AZURE_SEARCH_API_KEY")
243+
async def test_azure_ai_search_destination_drops_unknown_nested_fields(
244+
upload_file: Path,
245+
index: str,
246+
tmp_path: Path,
247+
):
248+
"""Regression test for the 400 reported on 2026-05-01: the new ``unstructured`` field
249+
``metadata.table_extraction_method`` was rejected by Azure's strict ComplexType validation.
250+
251+
Stages real elements, injects an unknown nested field and a stray top-level field, and
252+
runs the uploader against a real Azure index. The upload succeeding is itself the
253+
assertion: without the recursive filter, ``run_data`` would raise on Azure's 400 long
254+
before the document-count check; with the filter in place the unknown fields are pruned
255+
pre-upload and every staged element is indexed. Round-tripping a doc to assert the
256+
injected fields are absent would not add signal — Azure projects responses over the
257+
declared schema, so unknown fields are invisible on read regardless of who dropped them.
258+
"""
259+
file_data = FileData(
260+
source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
261+
connector_type=CONNECTOR_TYPE,
262+
identifier="mock file data nested filter",
263+
)
264+
stager = AzureAISearchUploadStager(upload_stager_config=AzureAISearchUploadStagerConfig())
265+
266+
uploader = AzureAISearchUploader(
267+
connection_config=AzureAISearchConnectionConfig(
268+
access_config=AzureAISearchAccessConfig(key=get_api_key()),
269+
endpoint=ENDPOINT,
270+
index=index,
271+
),
272+
upload_config=AzureAISearchUploaderConfig(),
273+
)
274+
staged_filepath = stager.run(
275+
elements_filepath=upload_file,
276+
file_data=file_data,
277+
output_dir=tmp_path,
278+
output_filename=upload_file.name,
279+
)
280+
281+
with staged_filepath.open() as f:
282+
staged_elements = json.load(f)
283+
assert staged_elements, "expected staged elements for the regression test"
284+
for element in staged_elements:
285+
element.setdefault("metadata", {})["table_extraction_method"] = "auto"
286+
element["future_undeclared_top_level_field"] = "drop me"
287+
288+
uploader.precheck()
289+
uploader.run_data(data=staged_elements, file_data=file_data)
290+
291+
expected_count = len(staged_elements)
292+
with uploader.connection_config.get_search_client() as search_client:
293+
validate_count(search_client=search_client, expected_count=expected_count)
294+
295+
240296
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
241297
@pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
242298
def test_azure_ai_search_stager(

0 commit comments

Comments
 (0)