fix: avoid ZeroDivisionError in BM25 retrieval on a tokenless corpus (#11619)

santino18727-debug · bogdankostic · web-flow · commit fbf0a8205747 · 2026-06-16T11:52:12.000+02:00
Co-authored-by: Eric (GabiDevFamily) <271972409+santino18727-debug@users.noreply.github.com>
Co-authored-by: bogdankostic <bogdankostic@web.de>
diff --git a/haystack/document_stores/in_memory/document_store.py b/haystack/document_stores/in_memory/document_store.py
@@ -681,7 +681,16 @@ def bm25_retrieval(
             logger.info("No documents found for BM25 retrieval. Returning empty list.")
             return []
 
-        results = sorted(self.bm25_algorithm_inst(query, all_documents), key=lambda x: x[1], reverse=True)[:top_k]
+        # A tokenless corpus (every stored document has empty content) has no vocabulary and an
+        # average document length of zero, which would make all three BM25 algorithms divide by
+        # zero during scoring. Score every candidate as 0.0 instead; the non-positive-score
+        # handling below then keeps them for BM25Okapi (unscaled) and drops them otherwise.
+        if self._avg_doc_len == 0:
+            scored_documents = [(doc, 0.0) for doc in all_documents]
+        else:
+            scored_documents = self.bm25_algorithm_inst(query, all_documents)
+
+        results = sorted(scored_documents, key=lambda x: x[1], reverse=True)[:top_k]
 
         # BM25Okapi can return meaningful negative values, so they should not be filtered out when scale_score is False.
         # It's the only algorithm supported by rank_bm25 at the time of writing (2024) that can return negative scores.
diff --git a/releasenotes/notes/fix-bm25-empty-corpus-zerodivision-d4e5f60718293041.yaml b/releasenotes/notes/fix-bm25-empty-corpus-zerodivision-d4e5f60718293041.yaml
@@ -0,0 +1,10 @@
+---
+fixes:
+  - |
+    Fixed a ``ZeroDivisionError`` in ``InMemoryDocumentStore.bm25_retrieval`` that occurred at query
+    time when every stored document had empty (but not ``None``) content. Such a tokenless corpus
+    has no vocabulary and an average document length of zero, which made all three BM25
+    algorithms divide by zero. Retrieval now scores every candidate as ``0.0`` in this case:
+    unscaled ``BM25Okapi`` returns the documents with score ``0.0``, while ``BM25L`` and ``BM25Plus``
+    return an empty list (non-positive scores are filtered). Corpora with at least one non-empty
+    document are unaffected.
diff --git a/test/document_stores/test_in_memory.py b/test/document_stores/test_in_memory.py
@@ -7,7 +7,7 @@
 import logging
 import math
 import tempfile
-from typing import cast
+from typing import Literal, cast
 from unittest.mock import patch
 
 import pytest
@@ -174,6 +174,23 @@ def test_bm25_retrieval_with_empty_document_store(
         assert len(results) == 0
         assert "No documents found for BM25 retrieval. Returning empty list." in caplog.text
 
+    @pytest.mark.parametrize("bm25_algorithm", ["BM25Okapi", "BM25L", "BM25Plus"])
+    def test_bm25_retrieval_with_tokenless_corpus(
+        self, bm25_algorithm: Literal["BM25Okapi", "BM25L", "BM25Plus"]
+    ) -> None:
+        # Regression test for #11598: a corpus where every document has empty (but not None)
+        # content must not raise ZeroDivisionError at query time.
+        store = InMemoryDocumentStore(bm25_algorithm=bm25_algorithm)
+        store.write_documents([Document(content="", meta={"i": 1}), Document(content="", meta={"i": 2})])
+        results = store.bm25_retrieval(query="anything")
+        if bm25_algorithm == "BM25Okapi":
+            # Unscaled BM25Okapi keeps non-positive scores, so documents are returned with score 0.0.
+            assert len(results) == 2
+            assert all(doc.score == 0.0 for doc in results)
+        else:
+            # BM25L / BM25Plus filter out non-positive scores.
+            assert results == []
+
     def test_bm25_retrieval_empty_query(self, document_store: InMemoryDocumentStore) -> None:
         # Tests if the bm25_retrieval method returns a document when the query is an empty string.
         docs = [Document(content="Hello world"), Document(content="Haystack supports multiple languages")]