Skip to content

Commit ee8c1aa

Browse files
Merge branch 'main' into fix/valkey-async-mixin-tests
2 parents 364614c + 86485b5 commit ee8c1aa

16 files changed

Lines changed: 601 additions & 324 deletions

File tree

.github/workflows/faiss.yml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,17 @@ jobs:
8989
SUBPROJECT_ID: faiss
9090
COMMENT_ARTIFACT_NAME: coverage-comment-faiss
9191

92-
# No integration tests yet — add integration-cov-append-retry + combined coverage step when needed
92+
- name: Run integration tests
93+
run: hatch run test:integration-cov-append-retry
9394

95+
- name: Store combined coverage
96+
if: github.event_name == 'push'
97+
uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
98+
with:
99+
GITHUB_TOKEN: ${{ github.token }}
100+
COVERAGE_PATH: integrations/faiss
101+
SUBPROJECT_ID: faiss-combined
102+
COMMENT_ARTIFACT_NAME: coverage-comment-faiss-combined
94103
- name: Run unit tests with lowest direct dependencies
95104
if: matrix.python-version == '3.10' && runner.os == 'Linux'
96105
run: |

.github/workflows/qdrant.yml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,18 @@ jobs:
9393
SUBPROJECT_ID: qdrant
9494
COMMENT_ARTIFACT_NAME: coverage-comment-qdrant
9595

96-
# No integration tests yet — add integration-cov-append-retry + combined coverage step when needed
96+
- name: Run integration tests
97+
run: hatch run test:integration-cov-append-retry
9798

99+
- name: Store combined coverage
100+
if: github.event_name == 'push'
101+
uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
102+
with:
103+
GITHUB_TOKEN: ${{ github.token }}
104+
COVERAGE_PATH: integrations/qdrant
105+
SUBPROJECT_ID: qdrant-combined
106+
COMMENT_ARTIFACT_NAME: coverage-comment-qdrant-combined
107+
98108
- name: Run unit tests with lowest direct dependencies
99109
if: github.event_name != 'push'
100110
run: |

README.md

Lines changed: 56 additions & 57 deletions
Large diffs are not rendered by default.

integrations/faiss/tests/test_document_store.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from haystack_integrations.document_stores.faiss import FAISSDocumentStore
2323

2424

25+
@pytest.mark.integration
2526
class TestFAISSDocumentStore(
2627
CountDocumentsTest,
2728
DeleteDocumentsTest,

integrations/faiss/tests/test_embedding_retriever.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ def populated_store(document_store):
3131
return document_store
3232

3333

34+
def test_invalid_document_store_type():
    """
    Passing something other than a FAISSDocumentStore as ``document_store`` raises ``ValueError``.
    """
    expected_message = "document_store must be an instance of FAISSDocumentStore"
    with pytest.raises(ValueError, match=expected_message):
        FAISSEmbeddingRetriever(document_store="not_a_store")  # type: ignore[arg-type]
37+
38+
39+
@pytest.mark.integration
3440
class TestFAISSEmbeddingRetriever:
3541
def test_run_with_query_embedding_only(self, populated_store):
3642
retriever = FAISSEmbeddingRetriever(document_store=populated_store, top_k=2)
@@ -112,10 +118,6 @@ def test_filter_policy_merge(self, populated_store):
112118
assert len(result["documents"]) >= 1
113119
assert all(d.meta["category"] == "A" for d in result["documents"])
114120

115-
def test_invalid_document_store_type(self):
116-
with pytest.raises(ValueError, match="document_store must be an instance of FAISSDocumentStore"):
117-
FAISSEmbeddingRetriever(document_store="not_a_store") # type: ignore[arg-type]
118-
119121
def test_run_in_pipeline(self, populated_store):
120122
"""End-to-end: FAISSEmbeddingRetriever wired into a Haystack Pipeline."""
121123
retriever = FAISSEmbeddingRetriever(document_store=populated_store, top_k=2)

integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from haystack.document_stores.types import DuplicatePolicy
1515
from haystack.utils.auth import Secret
1616
from haystack.utils.misc import _normalize_metadata_field_name
17-
from opensearchpy import AsyncHttpConnection, AsyncOpenSearch, OpenSearch
17+
from opensearchpy import AsyncHttpConnection, AsyncOpenSearch, OpenSearch, TransportError
1818
from opensearchpy.helpers import async_bulk, bulk
1919

2020
from haystack_integrations.document_stores.opensearch.auth import AsyncAWSAuth, AWSAuth
@@ -979,7 +979,27 @@ def _bm25_retrieval(
979979
all_terms_must_match=all_terms_must_match,
980980
custom_query=custom_query,
981981
)
982-
documents = self._search_documents(search_params)
982+
try:
983+
documents = self._search_documents(search_params)
984+
except TransportError as e:
985+
if "too_many_clauses" in f"{e.info} {e.error}" and fuzziness not in (0, "0") and custom_query is None:
986+
logger.warning(
987+
"BM25 query with fuzziness='{fuzziness}' exceeded OpenSearch's clause limit. "
988+
"Retrying with fuzziness=0 (exact matching). Consider reducing query length or "
989+
"setting fuzziness=0 explicitly if this occurs frequently.",
990+
fuzziness=fuzziness,
991+
)
992+
search_params = self._prepare_bm25_search_request(
993+
query=query,
994+
filters=filters,
995+
fuzziness=0,
996+
top_k=top_k,
997+
all_terms_must_match=all_terms_must_match,
998+
custom_query=custom_query,
999+
)
1000+
documents = self._search_documents(search_params)
1001+
else:
1002+
raise
9831003
OpenSearchDocumentStore._postprocess_bm25_search_results(results=documents, scale_score=scale_score)
9841004
return documents
9851005

@@ -1019,7 +1039,27 @@ async def _bm25_retrieval_async(
10191039
all_terms_must_match=all_terms_must_match,
10201040
custom_query=custom_query,
10211041
)
1022-
documents = await self._search_documents_async(search_params)
1042+
try:
1043+
documents = await self._search_documents_async(search_params)
1044+
except TransportError as e:
1045+
if "too_many_clauses" in f"{e.info} {e.error}" and fuzziness not in (0, "0") and custom_query is None:
1046+
logger.warning(
1047+
"BM25 query with fuzziness='{fuzziness}' exceeded OpenSearch's clause limit. "
1048+
"Retrying with fuzziness=0 (exact matching). Consider reducing query length or "
1049+
"setting fuzziness=0 explicitly if this occurs frequently.",
1050+
fuzziness=fuzziness,
1051+
)
1052+
search_params = self._prepare_bm25_search_request(
1053+
query=query,
1054+
filters=filters,
1055+
fuzziness=0,
1056+
top_k=top_k,
1057+
all_terms_must_match=all_terms_must_match,
1058+
custom_query=custom_query,
1059+
)
1060+
documents = await self._search_documents_async(search_params)
1061+
else:
1062+
raise
10231063
OpenSearchDocumentStore._postprocess_bm25_search_results(results=documents, scale_score=scale_score)
10241064
return documents
10251065

integrations/opensearch/tests/test_document_store.py

Lines changed: 178 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
import random
6-
from unittest.mock import patch
6+
from unittest.mock import AsyncMock, MagicMock, patch
77

88
import pytest
99
from haystack.dataclasses.document import Document
@@ -17,7 +17,7 @@
1717
GetMetadataFieldsInfoTest,
1818
GetMetadataFieldUniqueValuesTest,
1919
)
20-
from opensearchpy.exceptions import RequestError
20+
from opensearchpy.exceptions import RequestError, TransportError
2121

2222
from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore
2323
from haystack_integrations.document_stores.opensearch.document_store import DEFAULT_MAX_CHUNK_BYTES
@@ -224,6 +224,155 @@ def test_routing_in_delete(mock_bulk, _mock_opensearch_client):
224224
assert "_routing" not in actions[2]
225225

226226

227+
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
def test_bm25_retrieval_retries_with_fuzziness_zero_on_too_many_clauses(_mock_opensearch_client, caplog):
    """
    A ``too_many_clauses`` TransportError on a fuzzy BM25 search is retried once with fuzziness=0.
    """
    clause_overflow = TransportError(
        500, "search_phase_execution_exception", "too_many_clauses: maxClauseCount is set to 1024"
    )
    empty_response = {"hits": {"hits": []}}

    document_store = OpenSearchDocumentStore(hosts="testhost")
    document_store._client = MagicMock()
    search_mock = document_store._client.search
    # First search call raises, the second one returns an empty hit list.
    search_mock.side_effect = [clause_overflow, empty_response]

    retrieved = document_store._bm25_retrieval("a very long query", fuzziness="AUTO")

    assert retrieved == []
    assert search_mock.call_count == 2
    # The body sent on the retry must carry fuzziness=0.
    retry_body = search_mock.call_args_list[1].kwargs["body"]
    multi_match = retry_body["query"]["bool"]["must"][0]["multi_match"]
    assert multi_match["fuzziness"] == 0
    assert "Retrying with fuzziness=0" in caplog.text
248+
249+
250+
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
def test_bm25_retrieval_no_retry_when_fuzziness_already_zero(_mock_opensearch_client):
    """
    With fuzziness=0 a ``too_many_clauses`` TransportError propagates after a single search call.
    """
    document_store = OpenSearchDocumentStore(hosts="testhost")
    document_store._client = MagicMock()
    search_mock = document_store._client.search
    # An exception instance as side_effect is raised on every call.
    search_mock.side_effect = TransportError(
        500, "search_phase_execution_exception", "too_many_clauses: maxClauseCount is set to 1024"
    )

    with pytest.raises(TransportError):
        document_store._bm25_retrieval("a very long query", fuzziness=0)

    assert search_mock.call_count == 1
264+
265+
266+
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
def test_bm25_retrieval_no_retry_with_custom_query(_mock_opensearch_client):
    """
    When a custom query is supplied, a ``too_many_clauses`` TransportError is raised without a retry.
    """
    user_query = {"query": {"match": {"content": "$query"}}}

    document_store = OpenSearchDocumentStore(hosts="testhost")
    document_store._client = MagicMock()
    search_mock = document_store._client.search
    # An exception instance as side_effect is raised on every call.
    search_mock.side_effect = TransportError(
        500, "search_phase_execution_exception", "too_many_clauses: maxClauseCount is set to 1024"
    )

    with pytest.raises(TransportError):
        document_store._bm25_retrieval("a very long query", fuzziness="AUTO", custom_query=user_query)

    assert search_mock.call_count == 1
281+
282+
283+
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
def test_bm25_retrieval_reraises_other_transport_errors(_mock_opensearch_client):
    """
    A TransportError that does not mention ``too_many_clauses`` propagates after a single search call.
    """
    document_store = OpenSearchDocumentStore(hosts="testhost")
    document_store._client = MagicMock()
    search_mock = document_store._client.search
    search_mock.side_effect = TransportError(
        500, "parsing_exception", {"error": {"reason": "some other error"}}
    )

    with pytest.raises(TransportError):
        document_store._bm25_retrieval("some query", fuzziness="AUTO")

    assert search_mock.call_count == 1
295+
296+
297+
@pytest.mark.asyncio
@patch("haystack_integrations.document_stores.opensearch.document_store.AsyncOpenSearch")
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
async def test_bm25_retrieval_async_retries_with_fuzziness_zero_on_too_many_clauses(
    _mock_opensearch_client, _mock_async_client, caplog
):
    """
    Async variant: a ``too_many_clauses`` TransportError on a fuzzy BM25 search is retried once with fuzziness=0.
    """
    clause_overflow = TransportError(
        500, "search_phase_execution_exception", "too_many_clauses: maxClauseCount is set to 1024"
    )
    empty_response = {"hits": {"hits": []}}

    document_store = OpenSearchDocumentStore(hosts="testhost")
    document_store._async_client = AsyncMock()
    search_mock = document_store._async_client.search
    # First awaited search raises, the second one returns an empty hit list.
    search_mock.side_effect = [clause_overflow, empty_response]

    retrieved = await document_store._bm25_retrieval_async("a very long query", fuzziness="AUTO")

    assert retrieved == []
    assert search_mock.call_count == 2
    # The body sent on the retry must carry fuzziness=0.
    retry_body = search_mock.call_args_list[1].kwargs["body"]
    multi_match = retry_body["query"]["bool"]["must"][0]["multi_match"]
    assert multi_match["fuzziness"] == 0
    assert "Retrying with fuzziness=0" in caplog.text
321+
322+
323+
@pytest.mark.asyncio
@patch("haystack_integrations.document_stores.opensearch.document_store.AsyncOpenSearch")
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
async def test_bm25_retrieval_async_no_retry_when_fuzziness_already_zero(_mock_opensearch_client, _mock_async_client):
    """
    Async variant: with fuzziness=0 a ``too_many_clauses`` TransportError propagates after one search call.
    """
    document_store = OpenSearchDocumentStore(hosts="testhost")
    document_store._async_client = AsyncMock()
    search_mock = document_store._async_client.search
    # An exception instance as side_effect is raised on every call.
    search_mock.side_effect = TransportError(
        500, "search_phase_execution_exception", "too_many_clauses: maxClauseCount is set to 1024"
    )

    with pytest.raises(TransportError):
        await document_store._bm25_retrieval_async("a very long query", fuzziness=0)

    assert search_mock.call_count == 1
339+
340+
341+
@pytest.mark.asyncio
@patch("haystack_integrations.document_stores.opensearch.document_store.AsyncOpenSearch")
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
async def test_bm25_retrieval_async_no_retry_with_custom_query(_mock_opensearch_client, _mock_async_client):
    """
    Async variant: with a custom query, a ``too_many_clauses`` TransportError is raised without a retry.
    """
    user_query = {"query": {"match": {"content": "$query"}}}

    document_store = OpenSearchDocumentStore(hosts="testhost")
    document_store._async_client = AsyncMock()
    search_mock = document_store._async_client.search
    # An exception instance as side_effect is raised on every call.
    search_mock.side_effect = TransportError(
        500, "search_phase_execution_exception", "too_many_clauses: maxClauseCount is set to 1024"
    )

    with pytest.raises(TransportError):
        await document_store._bm25_retrieval_async(
            "a very long query", fuzziness="AUTO", custom_query=user_query
        )

    assert search_mock.call_count == 1
358+
359+
360+
@pytest.mark.asyncio
@patch("haystack_integrations.document_stores.opensearch.document_store.AsyncOpenSearch")
@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
async def test_bm25_retrieval_async_reraises_other_transport_errors(_mock_opensearch_client, _mock_async_client):
    """
    Async variant: a TransportError that does not mention ``too_many_clauses`` propagates after one search call.
    """
    document_store = OpenSearchDocumentStore(hosts="testhost")
    document_store._async_client = AsyncMock()
    search_mock = document_store._async_client.search
    search_mock.side_effect = TransportError(
        500, "parsing_exception", {"error": {"reason": "some other error"}}
    )

    with pytest.raises(TransportError):
        await document_store._bm25_retrieval_async("some query", fuzziness="AUTO")

    assert search_mock.call_count == 1
374+
375+
227376
@pytest.mark.integration
228377
class TestDocumentStore(
229378
CountDocumentsByFilterTest,
@@ -333,6 +482,33 @@ def test_bm25_retrieval_with_fuzziness(
333482
assert "functional" in res[1].content
334483
assert "functional" in res[2].content
335484

485+
def test_bm25_retrieval_with_fuzziness_overflow(self, document_store: OpenSearchDocumentStore, caplog):
    """
    A long fuzziness="AUTO" query that is expected to exceed OpenSearch's maxClauseCount
    must not raise; the store is expected to log a retry with fuzziness=0 instead.
    """
    # Vocabulary of 20 * 26 = 520 near-identical 5-character words ("fooaa" ... "footz").
    # The intent is that under fuzziness="AUTO" each query term fuzzy-matches many of its
    # indexed neighbours, so the expanded query exceeds the default clause limit (1024),
    # while under fuzziness=0 every term contributes a single clause.
    vocabulary = []
    for first in range(20):
        for second in range(26):
            vocabulary.append(f"foo{chr(97 + first)}{chr(97 + second)}")

    # Spread the vocabulary over documents of 52 words each, with ids "0", "1", ...
    words_per_document = 52
    documents = []
    for doc_number, start in enumerate(range(0, len(vocabulary), words_per_document)):
        chunk = vocabulary[start : start + words_per_document]
        documents.append(Document(content=" ".join(chunk), id=str(doc_number)))
    document_store.write_documents(documents)

    # Query with the first 100 words of the vocabulary.
    long_query = " ".join(vocabulary[:100])

    # Must not raise: the too_many_clauses error is expected to be caught and retried.
    retrieved = document_store._bm25_retrieval(long_query, top_k=3, fuzziness="AUTO")
    assert isinstance(retrieved, list)
    assert "Retrying with fuzziness=0" in caplog.text
511+
336512
def test_bm25_retrieval_with_filters(self, document_store: OpenSearchDocumentStore, test_documents: list[Document]):
337513
document_store.write_documents(test_documents)
338514
res = document_store._bm25_retrieval(

0 commit comments

Comments
 (0)