Skip to content

Commit fbf0a82

Browse files
fix: avoid ZeroDivisionError in BM25 retrieval on a tokenless corpus (#11619)
Co-authored-by: Eric (GabiDevFamily) <271972409+santino18727-debug@users.noreply.github.com>
Co-authored-by: bogdankostic <bogdankostic@web.de>
1 parent 9eed907 commit fbf0a82

3 files changed

Lines changed: 38 additions & 2 deletions

File tree

haystack/document_stores/in_memory/document_store.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -681,7 +681,16 @@ def bm25_retrieval(
681681
logger.info("No documents found for BM25 retrieval. Returning empty list.")
682682
return []
683683

684-
results = sorted(self.bm25_algorithm_inst(query, all_documents), key=lambda x: x[1], reverse=True)[:top_k]
684+
# A tokenless corpus (every stored document has empty content) has no vocabulary and an
685+
# average document length of zero, which would make all three BM25 algorithms divide by
686+
# zero during scoring. Score every candidate as 0.0 instead; the non-positive-score
687+
# handling below then keeps them for BM25Okapi (unscaled) and drops them otherwise.
688+
if self._avg_doc_len == 0:
689+
scored_documents = [(doc, 0.0) for doc in all_documents]
690+
else:
691+
scored_documents = self.bm25_algorithm_inst(query, all_documents)
692+
693+
results = sorted(scored_documents, key=lambda x: x[1], reverse=True)[:top_k]
685694

686695
# BM25Okapi can return meaningful negative values, so they should not be filtered out when scale_score is False.
687696
# It's the only algorithm supported by rank_bm25 at the time of writing (2024) that can return negative scores.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
fixes:
3+
- |
4+
Fixed a ``ZeroDivisionError`` in ``InMemoryDocumentStore.bm25_retrieval`` that occurred at query
5+
time when every stored document had empty (but not ``None``) content. Such a tokenless corpus
6+
has no vocabulary and an average document length of zero, which made all three BM25
7+
algorithms divide by zero. Retrieval now scores every candidate as ``0.0`` in this case:
8+
unscaled ``BM25Okapi`` returns the documents with score ``0.0``, while ``BM25L`` and ``BM25Plus``
9+
return an empty list (non-positive scores are filtered). Corpora with at least one non-empty
10+
document are unaffected.

test/document_stores/test_in_memory.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import logging
88
import math
99
import tempfile
10-
from typing import cast
10+
from typing import Literal, cast
1111
from unittest.mock import patch
1212

1313
import pytest
@@ -174,6 +174,23 @@ def test_bm25_retrieval_with_empty_document_store(
174174
assert len(results) == 0
175175
assert "No documents found for BM25 retrieval. Returning empty list." in caplog.text
176176

177+
@pytest.mark.parametrize("bm25_algorithm", ["BM25Okapi", "BM25L", "BM25Plus"])
178+
def test_bm25_retrieval_with_tokenless_corpus(
179+
self, bm25_algorithm: Literal["BM25Okapi", "BM25L", "BM25Plus"]
180+
) -> None:
181+
# Regression test for #11598: a corpus where every document has empty (but not None)
182+
# content must not raise ZeroDivisionError at query time.
183+
store = InMemoryDocumentStore(bm25_algorithm=bm25_algorithm)
184+
store.write_documents([Document(content="", meta={"i": 1}), Document(content="", meta={"i": 2})])
185+
results = store.bm25_retrieval(query="anything")
186+
if bm25_algorithm == "BM25Okapi":
187+
# Unscaled BM25Okapi keeps non-positive scores, so documents are returned with score 0.0.
188+
assert len(results) == 2
189+
assert all(doc.score == 0.0 for doc in results)
190+
else:
191+
# BM25L / BM25Plus filter out non-positive scores.
192+
assert results == []
193+
177194
def test_bm25_retrieval_empty_query(self, document_store: InMemoryDocumentStore) -> None:
178195
# Tests if the bm25_retrieval method returns a document when the query is an empty string.
179196
docs = [Document(content="Hello world"), Document(content="Haystack supports multiple languages")]

0 commit comments

Comments
 (0)