Skip to content

Commit 4b475ce

Browse files
authored
test: AzureDocumentIntelligenceConverter - add tests (#3205)
1 parent 0a9303f commit 4b475ce

1 file changed

Lines changed: 146 additions & 0 deletions

File tree

integrations/azure_doc_intelligence/tests/test_azure_document_intelligence_converter.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,34 @@
44

55
import os
66
from pathlib import Path
7+
from unittest.mock import MagicMock, patch
78

89
import pytest
10+
from haystack.dataclasses import ByteStream
911
from haystack.utils import Secret
1012

1113
from haystack_integrations.components.converters.azure_doc_intelligence import AzureDocumentIntelligenceConverter
1214

1315

16+
def make_mock_analyze_result(content="# Title\n\nBody", pages=("p1",), as_dict_value=None):
    """Build a ``MagicMock`` that stands in for an analyze result.

    :param content: Value exposed as the mock's ``content`` attribute.
    :param pages: Iterable copied into a list for the ``pages`` attribute; ``None`` is passed through as ``None``.
    :param as_dict_value: Value returned by ``as_dict()``; when ``None``, ``{"content": content}`` is used.
    :returns: The configured ``MagicMock``.
    """
    # Only ``None`` triggers the fallback, so an explicit empty dict is kept as given.
    if as_dict_value is None:
        as_dict_value = {"content": content}
    page_list = None if pages is None else list(pages)

    mock_result = MagicMock()
    mock_result.as_dict.return_value = as_dict_value
    mock_result.pages = page_list
    mock_result.content = content
    return mock_result
22+
23+
24+
@pytest.fixture
def warmed_converter():
    """Return a converter whose ``client`` is a ``MagicMock`` yielding the default mock analyze result."""
    fake_client = MagicMock()
    # begin_analyze_document(...).result() hands back the default mock analyze result.
    fake_client.begin_analyze_document.return_value.result.return_value = make_mock_analyze_result()

    instance = AzureDocumentIntelligenceConverter(
        endpoint="https://test.cognitiveservices.azure.com/",
        api_key=Secret.from_token("test_api_key"),
    )
    instance.client = fake_client
    return instance
33+
34+
1435
class TestAzureDocumentIntelligenceConverter:
1536
def test_init_default(self):
1637
"""Test basic initialization with defaults"""
@@ -96,6 +117,131 @@ def test_from_dict(self):
96117
assert converter.model_id == "prebuilt-layout"
97118
assert converter.store_full_path is False
98119

120+
def test_warm_up_initializes_client_only_once(self):
    """Test that a second warm_up call does not construct the client again"""
    client_cls_target = (
        "haystack_integrations.components.converters.azure_doc_intelligence.converter.DocumentIntelligenceClient"
    )
    instance = AzureDocumentIntelligenceConverter(
        endpoint="https://test.cognitiveservices.azure.com/",
        api_key=Secret.from_token("test_api_key"),
    )
    assert instance.client is None

    with patch(client_cls_target) as client_cls:
        # First call: the patched class is instantiated and stored on the converter.
        instance.warm_up()
        assert instance.client is client_cls.return_value
        client_cls.assert_called_once()

        # Second call: the constructor call count must still be exactly one.
        instance.warm_up()
        client_cls.assert_called_once()
134+
135+
def test_run_calls_warm_up_when_client_is_none(self):
    """Test that run on a fresh converter leaves the patched client instance set and yields one document"""
    client_cls_target = (
        "haystack_integrations.components.converters.azure_doc_intelligence.converter.DocumentIntelligenceClient"
    )
    instance = AzureDocumentIntelligenceConverter(
        endpoint="https://test.cognitiveservices.azure.com/",
        api_key=Secret.from_token("test_api_key"),
    )

    with patch(client_cls_target) as client_cls:
        fake_client = client_cls.return_value
        fake_client.begin_analyze_document.return_value.result.return_value = make_mock_analyze_result()
        output = instance.run(sources=[ByteStream.from_string("data")])

    assert instance.client is fake_client
    documents = output["documents"]
    assert len(documents) == 1
150+
151+
def test_run_returns_document_with_markdown_content_and_meta(self, warmed_converter):
    """Test that the document carries the result content plus model_id and page_count meta"""
    markdown = "# Heading\n\nHello"
    analyze_call = warmed_converter.client.begin_analyze_document.return_value
    analyze_call.result.return_value = make_mock_analyze_result(content=markdown, pages=("p1", "p2", "p3"))

    documents = warmed_converter.run(sources=[ByteStream.from_string("data")])["documents"]

    assert len(documents) == 1
    document = documents[0]
    assert document.content == markdown
    assert document.meta["model_id"] == "prebuilt-document"
    # Three mocked pages were supplied above.
    assert document.meta["page_count"] == 3
163+
164+
def test_run_returns_raw_azure_response(self, warmed_converter):
    """Test that raw_azure_response holds the dict returned by the result's as_dict()"""
    expected_raw = {"content": "text", "pages": [{"page_number": 1}]}
    analyze_call = warmed_converter.client.begin_analyze_document.return_value
    analyze_call.result.return_value = make_mock_analyze_result(content="text", as_dict_value=expected_raw)

    output = warmed_converter.run(sources=[ByteStream.from_string("data")])

    assert output["raw_azure_response"] == [expected_raw]
173+
174+
def test_run_with_multiple_sources(self, warmed_converter):
    """Test that two sources produce two documents and two raw responses"""
    streams = [ByteStream.from_string(text) for text in ("one", "two")]

    output = warmed_converter.run(sources=streams)

    for key in ("documents", "raw_azure_response"):
        assert len(output[key]) == 2
181+
182+
@pytest.mark.parametrize("store_full_path", [True, False])
def test_run_respects_store_full_path(self, store_full_path):
    """Test that meta file_path is the full path or just the file name, depending on store_full_path"""
    sample_pdf = Path(__file__).parent.joinpath("test_files", "pdf", "sample_pdf_1.pdf")
    instance = AzureDocumentIntelligenceConverter(
        endpoint="https://test.cognitiveservices.azure.com/",
        api_key=Secret.from_token("test_api_key"),
        store_full_path=store_full_path,
    )
    fake_client = MagicMock()
    fake_client.begin_analyze_document.return_value.result.return_value = make_mock_analyze_result()
    instance.client = fake_client

    documents = instance.run(sources=[str(sample_pdf)])["documents"]

    if store_full_path:
        expected_path = str(sample_pdf)
    else:
        expected_path = "sample_pdf_1.pdf"
    assert documents[0].meta["file_path"] == expected_path
197+
198+
def test_run_applies_single_meta_dict_to_all_documents(self, warmed_converter):
    """Test that a single meta dict shows up on every returned document"""
    streams = [ByteStream.from_string(text) for text in ("one", "two")]

    output = warmed_converter.run(sources=streams, meta={"shared": "value"})

    for document in output["documents"]:
        assert document.meta["shared"] == "value"
204+
205+
def test_run_applies_meta_list_pairwise(self, warmed_converter):
    """Test that the i-th meta dict in a list lands on the i-th returned document"""
    streams = [ByteStream.from_string(label) for label in ("a", "b")]
    per_source_meta = [{"index": 0}, {"index": 1}]

    documents = warmed_converter.run(sources=streams, meta=per_source_meta)["documents"]

    # Checked in order so the first mismatch is the one reported.
    for position in (0, 1):
        assert documents[position].meta["index"] == position
212+
213+
def test_run_skips_unreadable_source(self, warmed_converter):
    """Test that a path that does not exist yields no documents and no raw responses"""
    missing_path = "/nonexistent/missing.pdf"

    output = warmed_converter.run(sources=[missing_path])

    assert output["documents"] == []
    assert output["raw_azure_response"] == []
218+
219+
def test_run_skips_source_when_azure_analysis_fails(self, warmed_converter):
    """Test that an error raised by begin_analyze_document yields no documents and no raw responses"""
    analyze = warmed_converter.client.begin_analyze_document
    analyze.side_effect = RuntimeError("Azure failure")

    output = warmed_converter.run(sources=[ByteStream.from_string("data")])

    assert output["documents"] == []
    assert output["raw_azure_response"] == []
226+
227+
def test_run_uses_empty_string_when_result_content_is_none(self, warmed_converter):
    """Test that a result whose content is None produces a document with empty-string content"""
    analyze_call = warmed_converter.client.begin_analyze_document.return_value
    analyze_call.result.return_value = make_mock_analyze_result(content=None)

    documents = warmed_converter.run(sources=[ByteStream.from_string("data")])["documents"]

    assert documents[0].content == ""
235+
236+
def test_run_sets_page_count_zero_when_result_has_no_pages(self, warmed_converter):
    """Test that a result whose pages is None produces page_count 0 in the document meta"""
    analyze_call = warmed_converter.client.begin_analyze_document.return_value
    analyze_call.result.return_value = make_mock_analyze_result(pages=None)

    documents = warmed_converter.run(sources=[ByteStream.from_string("data")])["documents"]

    assert documents[0].meta["page_count"] == 0
244+
99245

100246
@pytest.mark.integration
101247
class TestAzureDocumentIntelligenceConverterIntegration:

0 commit comments

Comments
 (0)