|
3 | 3 | # SPDX-License-Identifier: Apache-2.0 |
4 | 4 |
|
5 | 5 | import random |
6 | | -from unittest.mock import patch |
| 6 | +from unittest.mock import AsyncMock, MagicMock, patch |
7 | 7 |
|
8 | 8 | import pytest |
9 | 9 | from haystack.dataclasses.document import Document |
|
17 | 17 | GetMetadataFieldsInfoTest, |
18 | 18 | GetMetadataFieldUniqueValuesTest, |
19 | 19 | ) |
20 | | -from opensearchpy.exceptions import RequestError |
| 20 | +from opensearchpy.exceptions import RequestError, TransportError |
21 | 21 |
|
22 | 22 | from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore |
23 | 23 | from haystack_integrations.document_stores.opensearch.document_store import DEFAULT_MAX_CHUNK_BYTES |
@@ -224,6 +224,155 @@ def test_routing_in_delete(mock_bulk, _mock_opensearch_client): |
224 | 224 | assert "_routing" not in actions[2] |
225 | 225 |
|
226 | 226 |
|
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
def test_bm25_retrieval_retries_with_fuzziness_zero_on_too_many_clauses(_mock_opensearch_client, caplog):
    """
    A too_many_clauses failure on the first search is followed by one retry with fuzziness=0.

    The mocked client raises on the first search call and returns an empty hit list on the second.
    """
    clause_overflow = TransportError(
        500, "search_phase_execution_exception", "too_many_clauses: maxClauseCount is set to 1024"
    )
    empty_response = {"hits": {"hits": []}}

    store = OpenSearchDocumentStore(hosts="testhost")
    store._client = MagicMock()
    search_mock = store._client.search
    search_mock.side_effect = [clause_overflow, empty_response]

    documents = store._bm25_retrieval("a very long query", fuzziness="AUTO")

    assert documents == []
    assert search_mock.call_count == 2
    # The second recorded call is the retry; its multi_match clause must carry fuzziness=0.
    retry_body = search_mock.call_args_list[1].kwargs["body"]
    multi_match = retry_body["query"]["bool"]["must"][0]["multi_match"]
    assert multi_match["fuzziness"] == 0
    assert "Retrying with fuzziness=0" in caplog.text
| 248 | + |
| 249 | + |
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
def test_bm25_retrieval_no_retry_when_fuzziness_already_zero(_mock_opensearch_client):
    """
    With fuzziness=0 a too_many_clauses error propagates after a single search call.
    """
    clause_overflow = TransportError(
        500, "search_phase_execution_exception", "too_many_clauses: maxClauseCount is set to 1024"
    )

    store = OpenSearchDocumentStore(hosts="testhost")
    store._client = MagicMock()
    search_mock = store._client.search
    # Every search call raises the same error.
    search_mock.side_effect = clause_overflow

    with pytest.raises(TransportError):
        store._bm25_retrieval("a very long query", fuzziness=0)

    # Exactly one attempt: no second search was issued.
    assert search_mock.call_count == 1
| 264 | + |
| 265 | + |
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
def test_bm25_retrieval_no_retry_with_custom_query(_mock_opensearch_client):
    """
    When a custom query is supplied, a too_many_clauses error propagates after a single search call.
    """
    clause_overflow = TransportError(
        500, "search_phase_execution_exception", "too_many_clauses: maxClauseCount is set to 1024"
    )
    user_query = {"query": {"match": {"content": "$query"}}}

    store = OpenSearchDocumentStore(hosts="testhost")
    store._client = MagicMock()
    search_mock = store._client.search
    # Every search call raises the same error.
    search_mock.side_effect = clause_overflow

    with pytest.raises(TransportError):
        store._bm25_retrieval("a very long query", fuzziness="AUTO", custom_query=user_query)

    # Exactly one attempt: no second search was issued.
    assert search_mock.call_count == 1
| 281 | + |
| 282 | + |
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
def test_bm25_retrieval_reraises_other_transport_errors(_mock_opensearch_client):
    """
    A TransportError that is not a too_many_clauses failure propagates after a single search call.
    """
    unrelated_error = TransportError(500, "parsing_exception", {"error": {"reason": "some other error"}})

    store = OpenSearchDocumentStore(hosts="testhost")
    store._client = MagicMock()
    search_mock = store._client.search
    search_mock.side_effect = unrelated_error

    with pytest.raises(TransportError):
        store._bm25_retrieval("some query", fuzziness="AUTO")

    # Exactly one attempt: no second search was issued.
    assert search_mock.call_count == 1
| 295 | + |
| 296 | + |
@pytest.mark.asyncio
@patch("haystack_integrations.document_stores.opensearch.document_store.AsyncOpenSearch")
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
async def test_bm25_retrieval_async_retries_with_fuzziness_zero_on_too_many_clauses(
    _mock_opensearch_client, _mock_async_client, caplog
):
    """
    Async variant: a too_many_clauses failure on the first search is followed by one retry with fuzziness=0.

    The mocked async client raises on the first search call and returns an empty hit list on the second.
    """
    clause_overflow = TransportError(
        500, "search_phase_execution_exception", "too_many_clauses: maxClauseCount is set to 1024"
    )
    empty_response = {"hits": {"hits": []}}

    store = OpenSearchDocumentStore(hosts="testhost")
    store._async_client = AsyncMock()
    search_mock = store._async_client.search
    search_mock.side_effect = [clause_overflow, empty_response]

    documents = await store._bm25_retrieval_async("a very long query", fuzziness="AUTO")

    assert documents == []
    assert search_mock.call_count == 2
    # The second recorded call is the retry; its multi_match clause must carry fuzziness=0.
    retry_body = search_mock.call_args_list[1].kwargs["body"]
    multi_match = retry_body["query"]["bool"]["must"][0]["multi_match"]
    assert multi_match["fuzziness"] == 0
    assert "Retrying with fuzziness=0" in caplog.text
| 321 | + |
| 322 | + |
@pytest.mark.asyncio
@patch("haystack_integrations.document_stores.opensearch.document_store.AsyncOpenSearch")
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
async def test_bm25_retrieval_async_no_retry_when_fuzziness_already_zero(_mock_opensearch_client, _mock_async_client):
    """
    Async variant: with fuzziness=0 a too_many_clauses error propagates after a single search call.
    """
    clause_overflow = TransportError(
        500, "search_phase_execution_exception", "too_many_clauses: maxClauseCount is set to 1024"
    )

    store = OpenSearchDocumentStore(hosts="testhost")
    store._async_client = AsyncMock()
    search_mock = store._async_client.search
    # Every search call raises the same error.
    search_mock.side_effect = clause_overflow

    with pytest.raises(TransportError):
        await store._bm25_retrieval_async("a very long query", fuzziness=0)

    # Exactly one attempt: no second search was issued.
    assert search_mock.call_count == 1
| 339 | + |
| 340 | + |
@pytest.mark.asyncio
@patch("haystack_integrations.document_stores.opensearch.document_store.AsyncOpenSearch")
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
async def test_bm25_retrieval_async_no_retry_with_custom_query(_mock_opensearch_client, _mock_async_client):
    """
    Async variant: with a custom query, a too_many_clauses error propagates after a single search call.
    """
    clause_overflow = TransportError(
        500, "search_phase_execution_exception", "too_many_clauses: maxClauseCount is set to 1024"
    )
    user_query = {"query": {"match": {"content": "$query"}}}

    store = OpenSearchDocumentStore(hosts="testhost")
    store._async_client = AsyncMock()
    search_mock = store._async_client.search
    # Every search call raises the same error.
    search_mock.side_effect = clause_overflow

    with pytest.raises(TransportError):
        await store._bm25_retrieval_async("a very long query", fuzziness="AUTO", custom_query=user_query)

    # Exactly one attempt: no second search was issued.
    assert search_mock.call_count == 1
| 358 | + |
| 359 | + |
@pytest.mark.asyncio
@patch("haystack_integrations.document_stores.opensearch.document_store.AsyncOpenSearch")
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
async def test_bm25_retrieval_async_reraises_other_transport_errors(_mock_opensearch_client, _mock_async_client):
    """
    Async variant: a TransportError that is not a too_many_clauses failure propagates after one search call.
    """
    unrelated_error = TransportError(500, "parsing_exception", {"error": {"reason": "some other error"}})

    store = OpenSearchDocumentStore(hosts="testhost")
    store._async_client = AsyncMock()
    search_mock = store._async_client.search
    search_mock.side_effect = unrelated_error

    with pytest.raises(TransportError):
        await store._bm25_retrieval_async("some query", fuzziness="AUTO")

    # Exactly one attempt: no second search was issued.
    assert search_mock.call_count == 1
| 374 | + |
| 375 | + |
227 | 376 | @pytest.mark.integration |
228 | 377 | class TestDocumentStore( |
229 | 378 | CountDocumentsByFilterTest, |
@@ -333,6 +482,33 @@ def test_bm25_retrieval_with_fuzziness( |
333 | 482 | assert "functional" in res[1].content |
334 | 483 | assert "functional" in res[2].content |
335 | 484 |
|
def test_bm25_retrieval_with_fuzziness_overflow(self, document_store: OpenSearchDocumentStore, caplog):
    """
    A long fuzziness="AUTO" query that overflows OpenSearch's maxClauseCount must not raise.

    The retrieval is expected to return a list and to log a message containing
    "Retrying with fuzziness=0".
    """
    # Vocabulary of 20 * 26 = 520 five-character words sharing the "foo" prefix. The intent is
    # that with fuzziness="AUTO" each query term fuzzy-matches many near-identical indexed terms,
    # expanding the query past the default maxClauseCount (1024), whereas fuzziness=0 keeps one
    # clause per term.
    vocabulary = []
    for first in range(20):
        for second in range(26):
            vocabulary.append(f"foo{chr(97 + first)}{chr(97 + second)}")

    # Spread the vocabulary over documents of 52 words each, with sequential string ids.
    words_per_doc = 52
    documents = []
    for doc_id, start in enumerate(range(0, len(vocabulary), words_per_doc)):
        content = " ".join(vocabulary[start : start + words_per_doc])
        documents.append(Document(content=content, id=str(doc_id)))
    document_store.write_documents(documents)

    # Query with the first 100 words; each term is expected to expand to roughly 45 similar
    # indexed terms under fuzziness="AUTO", well above the clause limit.
    oversized_query = " ".join(vocabulary[:100])

    # Must not raise: the too_many_clauses error is expected to be caught and retried with fuzziness=0.
    hits = document_store._bm25_retrieval(oversized_query, top_k=3, fuzziness="AUTO")
    assert isinstance(hits, list)
    assert "Retrying with fuzziness=0" in caplog.text
| 511 | + |
336 | 512 | def test_bm25_retrieval_with_filters(self, document_store: OpenSearchDocumentStore, test_documents: list[Document]): |
337 | 513 | document_store.write_documents(test_documents) |
338 | 514 | res = document_store._bm25_retrieval( |
|
0 commit comments