fix: RecursiveSplitter bug in the case when the recursive chunking is triggered (#9316)

davidsbatista · web-flow · commit 201becd40040 · 2025-04-30T13:03:23.000+02:00
* initial import

* adding release notes

* Update fixing-bug-recursive-splitter-88d5714529f84e4e.yaml
diff --git a/haystack/components/preprocessors/recursive_splitter.py b/haystack/components/preprocessors/recursive_splitter.py
@@ -336,7 +336,6 @@ def _chunk_text(self, text: str) -> List[str]:
                             chunks.extend(fall_back_chunks)
                         else:
                             chunks.extend(self._chunk_text(split_text))
-                        current_length += self._chunk_length(split_text)
 
                     else:
                         current_chunk.append(split_text)
diff --git a/releasenotes/notes/fixing-bug-recursive-splitter-88d5714529f84e4e.yaml b/releasenotes/notes/fixing-bug-recursive-splitter-88d5714529f84e4e.yaml
@@ -0,0 +1,5 @@
+---
+
+fixes:
+  - |
+     A bug in the `RecursiveDocumentSplitter` was fixed for the case where a `split_text` is longer than the `split_length` and recursive chunking is triggered.
diff --git a/test/components/preprocessors/test_recursive_splitter.py b/test/components/preprocessors/test_recursive_splitter.py
@@ -426,7 +426,7 @@ def test_run_separator_exists_but_split_length_too_small_fall_back_to_character_
     result = splitter.run(documents=[doc])
     assert len(result["documents"]) == 10
     for doc in result["documents"]:
-        if re.escape(doc.content) not in ["\ "]:
+        if re.escape(doc.content) not in ["\\ "]:
             assert len(doc.content) == 2
 
 
@@ -944,3 +944,45 @@ def test_run_without_warm_up_raises_error():
     result = splitter_no_warmup.run(docs)
     assert len(result["documents"]) == 1
     assert result["documents"][0].content == "text"
+
+
+def test_run_complex_text_with_multiple_separators():
+    """
+    Test that RecursiveDocumentSplitter correctly handles complex text with multiple separators and chunks that exceed
+    the split_length.
+
+    The input has four sections delimited by a double newline; only the second one (208 characters including its
+    separators) is longer than split_length=200, so it is the one expected to be split again on the next separator.
+    The last two sections are short enough (102 + 50 characters) to be asserted as a single merged chunk.
+    """
+    # Create a complex text with multiple separators and chunks of different sizes
+    long_text = (
+        "A" * 150
+        + "\n\n"  # triggers first-level split on \n\n
+        + "B" * 100
+        + "\n"  # next separator in the list; the oversized "B" section is expected to be re-split here
+        + "B" * 105
+        + "\n\n"  # closes the "B" section (208 chars > split_length), which goes through recursion
+        + "C" * 100
+        + "\n\n"  # closes short section 1: 100 "C" + separator = 102 chars
+        + "D" * 50  # short section 2: 50 chars, so 102 + 50 = 152 still fits within split_length
+    )
+
+    doc = Document(content=long_text)
+    splitter = RecursiveDocumentSplitter(
+        split_length=200, split_overlap=0, split_unit="char", separators=["\n\n", "\n", " "]
+    )
+    splitter.warm_up()
+    result = splitter.run([doc])
+    chunks = result["documents"]
+
+    # The asserted lengths below imply that each separator stays attached to the text that precedes it.
+    assert len(chunks) == 4
+
+    # 150 "A" + "\n\n"
+    assert len(chunks[0].content) == 152
+    assert chunks[0].content.startswith("A")
+
+    # first half of the re-split "B" section: 100 "B" + "\n"
+    assert len(chunks[1].content) == 101
+    assert chunks[1].content.startswith("B")
+
+    # second half of the re-split "B" section: 105 "B" + "\n\n"
+    assert len(chunks[2].content) == 107
+    assert chunks[2].content.startswith("B")
+
+    # the two short sections merged into one chunk: 100 "C" + "\n\n" + 50 "D"
+    assert len(chunks[3].content) == 152
+    assert chunks[3].content.startswith("C")
+    assert chunks[3].content.endswith("D" * 50)

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
++
 +fixes:
 +  - |
 +     A bug in the `RecursiveDocumentSplitter` was fixed for the case where a `split_text` is longer than the `split_length` and recursive chunking is triggered.