|
4 | 4 |
|
5 | 5 | import os |
6 | 6 | from pathlib import Path |
| 7 | +from unittest.mock import MagicMock, patch |
7 | 8 |
|
8 | 9 | import pytest |
| 10 | +from haystack.dataclasses import ByteStream |
9 | 11 | from haystack.utils import Secret |
10 | 12 |
|
11 | 13 | from haystack_integrations.components.converters.azure_doc_intelligence import AzureDocumentIntelligenceConverter |
12 | 14 |
|
13 | 15 |
|
def make_mock_analyze_result(content="# Title\n\nBody", pages=("p1",), as_dict_value=None):
    """Build a MagicMock that stands in for an analyze result.

    :param content: Value exposed as the mock's ``content`` attribute.
    :param pages: Iterable copied into a list for the ``pages`` attribute; ``None`` is kept as ``None``.
    :param as_dict_value: Value returned by ``as_dict()``; when ``None``, ``{"content": content}`` is returned.
    :returns: The configured MagicMock.
    """
    if as_dict_value is None:
        as_dict_value = {"content": content}
    page_list = None if pages is None else list(pages)

    analyze_result = MagicMock()
    analyze_result.as_dict.return_value = as_dict_value
    analyze_result.pages = page_list
    analyze_result.content = content
    return analyze_result
| 22 | + |
| 23 | + |
@pytest.fixture
def warmed_converter():
    """Return a converter whose ``client`` is a MagicMock yielding the default mock analyze result."""
    fake_client = MagicMock()
    fake_client.begin_analyze_document.return_value.result.return_value = make_mock_analyze_result()

    converter = AzureDocumentIntelligenceConverter(
        endpoint="https://test.cognitiveservices.azure.com/",
        api_key=Secret.from_token("test_api_key"),
    )
    # Assign the client directly so tests using this fixture start with a non-None client.
    converter.client = fake_client
    return converter
| 33 | + |
| 34 | + |
14 | 35 | class TestAzureDocumentIntelligenceConverter: |
15 | 36 | def test_init_default(self): |
16 | 37 | """Test basic initialization with defaults""" |
@@ -96,6 +117,131 @@ def test_from_dict(self): |
96 | 117 | assert converter.model_id == "prebuilt-layout" |
97 | 118 | assert converter.store_full_path is False |
98 | 119 |
|
def test_warm_up_initializes_client_only_once(self):
    """warm_up() builds the client on the first call and does not build another on the second."""
    converter = AzureDocumentIntelligenceConverter(
        api_key=Secret.from_token("test_api_key"),
        endpoint="https://test.cognitiveservices.azure.com/",
    )
    assert converter.client is None

    with patch(
        "haystack_integrations.components.converters.azure_doc_intelligence.converter.DocumentIntelligenceClient"
    ) as client_cls:
        # First call: the patched class is instantiated exactly once and stored on the converter.
        converter.warm_up()
        assert converter.client is client_cls.return_value
        client_cls.assert_called_once()

        # Second call: the constructor call count must still be one.
        converter.warm_up()
        client_cls.assert_called_once()
| 134 | + |
def test_run_calls_warm_up_when_client_is_none(self):
    """run() on a converter without a client ends up with the patched client and one document."""
    converter = AzureDocumentIntelligenceConverter(
        api_key=Secret.from_token("test_api_key"),
        endpoint="https://test.cognitiveservices.azure.com/",
    )

    with patch(
        "haystack_integrations.components.converters.azure_doc_intelligence.converter.DocumentIntelligenceClient"
    ) as client_cls:
        fake_client = client_cls.return_value
        fake_client.begin_analyze_document.return_value.result.return_value = make_mock_analyze_result()
        output = converter.run(sources=[ByteStream.from_string("data")])

    assert converter.client is fake_client
    documents = output["documents"]
    assert len(documents) == 1
| 150 | + |
def test_run_returns_document_with_markdown_content_and_meta(self, warmed_converter):
    """The document carries the analyze result's content plus model_id and page_count metadata."""
    markdown = "# Heading\n\nHello"
    operation = warmed_converter.client.begin_analyze_document.return_value
    operation.result.return_value = make_mock_analyze_result(content=markdown, pages=("p1", "p2", "p3"))

    output = warmed_converter.run(sources=[ByteStream.from_string("data")])

    documents = output["documents"]
    assert len(documents) == 1
    document = documents[0]
    assert document.content == markdown
    assert document.meta["model_id"] == "prebuilt-document"
    # Three mocked pages were supplied above.
    assert document.meta["page_count"] == 3
| 163 | + |
def test_run_returns_raw_azure_response(self, warmed_converter):
    """run() exposes the analyze result's as_dict() payload under ``raw_azure_response``."""
    expected_raw = {"content": "text", "pages": [{"page_number": 1}]}
    analyze_result = make_mock_analyze_result(content="text", as_dict_value=expected_raw)
    warmed_converter.client.begin_analyze_document.return_value.result.return_value = analyze_result

    output = warmed_converter.run(sources=[ByteStream.from_string("data")])

    assert output["raw_azure_response"] == [expected_raw]
| 173 | + |
def test_run_with_multiple_sources(self, warmed_converter):
    """Two sources produce two documents and two raw responses."""
    streams = [ByteStream.from_string(text) for text in ("one", "two")]

    output = warmed_converter.run(sources=streams)

    for key in ("documents", "raw_azure_response"):
        assert len(output[key]) == 2
| 181 | + |
@pytest.mark.parametrize("store_full_path", [True, False])
def test_run_respects_store_full_path(self, store_full_path):
    """``file_path`` metadata is the full path when store_full_path is set, else just the file name."""
    fake_client = MagicMock()
    fake_client.begin_analyze_document.return_value.result.return_value = make_mock_analyze_result()
    converter = AzureDocumentIntelligenceConverter(
        endpoint="https://test.cognitiveservices.azure.com/",
        api_key=Secret.from_token("test_api_key"),
        store_full_path=store_full_path,
    )
    converter.client = fake_client

    sample_pdf = Path(__file__).parent.joinpath("test_files", "pdf", "sample_pdf_1.pdf")
    output = converter.run(sources=[str(sample_pdf)])

    if store_full_path:
        expected = str(sample_pdf)
    else:
        expected = "sample_pdf_1.pdf"
    first_document = output["documents"][0]
    assert first_document.meta["file_path"] == expected
| 197 | + |
def test_run_applies_single_meta_dict_to_all_documents(self, warmed_converter):
    """A single meta dict passed to run() is applied to every produced document."""
    sources = [ByteStream.from_string("one"), ByteStream.from_string("two")]

    result = warmed_converter.run(sources=sources, meta={"shared": "value"})

    documents = result["documents"]
    # all() is vacuously true for an empty list, so pin the document count first;
    # otherwise this test would still pass if no source had been converted.
    assert len(documents) == len(sources)
    assert all(doc.meta["shared"] == "value" for doc in documents)
| 204 | + |
def test_run_applies_meta_list_pairwise(self, warmed_converter):
    """A list of meta dicts is matched to the sources by position."""
    per_source_meta = [{"index": 0}, {"index": 1}]
    streams = [ByteStream.from_string("a"), ByteStream.from_string("b")]

    output = warmed_converter.run(sources=streams, meta=per_source_meta)

    documents = output["documents"]
    for position in (0, 1):
        assert documents[position].meta["index"] == position
| 212 | + |
def test_run_skips_unreadable_source(self, warmed_converter):
    """A path that does not exist yields no documents and no raw responses."""
    missing_path = "/nonexistent/missing.pdf"

    output = warmed_converter.run(sources=[missing_path])

    assert output["documents"] == []
    assert output["raw_azure_response"] == []
| 218 | + |
def test_run_skips_source_when_azure_analysis_fails(self, warmed_converter):
    """When begin_analyze_document raises, run() returns empty result lists."""
    analyze_call = warmed_converter.client.begin_analyze_document
    analyze_call.side_effect = RuntimeError("Azure failure")

    output = warmed_converter.run(sources=[ByteStream.from_string("data")])

    assert output["documents"] == []
    assert output["raw_azure_response"] == []
| 226 | + |
def test_run_uses_empty_string_when_result_content_is_none(self, warmed_converter):
    """An analyze result whose content is None produces a document with empty-string content."""
    contentless_result = make_mock_analyze_result(content=None)
    warmed_converter.client.begin_analyze_document.return_value.result.return_value = contentless_result

    output = warmed_converter.run(sources=[ByteStream.from_string("data")])

    first_document = output["documents"][0]
    assert first_document.content == ""
| 235 | + |
def test_run_sets_page_count_zero_when_result_has_no_pages(self, warmed_converter):
    """An analyze result whose pages attribute is None produces page_count == 0."""
    pageless_result = make_mock_analyze_result(pages=None)
    warmed_converter.client.begin_analyze_document.return_value.result.return_value = pageless_result

    output = warmed_converter.run(sources=[ByteStream.from_string("data")])

    first_document = output["documents"][0]
    assert first_document.meta["page_count"] == 0
| 244 | + |
99 | 245 |
|
100 | 246 | @pytest.mark.integration |
101 | 247 | class TestAzureDocumentIntelligenceConverterIntegration: |
|
0 commit comments