Skip to content

Commit 201becd

Browse files
fix: RecursiveSplitter bug in the case when the recursive chunking is triggered (#9316)
* initial import
* adding release notes
* Update fixing-bug-recursive-splitter-88d5714529f84e4e.yaml
1 parent f8eead3 commit 201becd

3 files changed

Lines changed: 48 additions & 2 deletions

File tree

haystack/components/preprocessors/recursive_splitter.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -336,7 +336,6 @@ def _chunk_text(self, text: str) -> List[str]:
336336
chunks.extend(fall_back_chunks)
337337
else:
338338
chunks.extend(self._chunk_text(split_text))
339-
current_length += self._chunk_length(split_text)
340339

341340
else:
342341
current_chunk.append(split_text)
Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
---
2+
3+
fixes:
4+
- |
5+
A bug in the `RecursiveDocumentSplitter` was fixed for the case where a `split_text` is longer than the `split_length` and recursive chunking is triggered.

test/components/preprocessors/test_recursive_splitter.py

Lines changed: 43 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -426,7 +426,7 @@ def test_run_separator_exists_but_split_length_too_small_fall_back_to_character_
426426
result = splitter.run(documents=[doc])
427427
assert len(result["documents"]) == 10
428428
for doc in result["documents"]:
429-
if re.escape(doc.content) not in ["\ "]:
429+
if re.escape(doc.content) not in ["\\ "]:
430430
assert len(doc.content) == 2
431431

432432

@@ -944,3 +944,45 @@ def test_run_without_warm_up_raises_error():
944944
result = splitter_no_warmup.run(docs)
945945
assert len(result["documents"]) == 1
946946
assert result["documents"][0].content == "text"
947+
948+
949+
def test_run_complex_text_with_multiple_separators():
950+
"""
951+
Test that RecursiveDocumentSplitter correctly handles complex text with multiple separators and chunks that exceed
952+
the split_length.
953+
"""
954+
# Create a complex text with multiple separators and chunks of different sizes
955+
long_text = (
956+
"A" * 150
957+
+ "\n\n" # triggers first-level split on \n\n
958+
+ "B" * 100
959+
+ "\n"
960+
+ "B" * 105
961+
+ "\n\n" # this chunk exceeds split_length and goes through recursion
962+
+ "C" * 100
963+
+ "\n\n" # short chunk1
964+
+ "D" * 50 # short chunk2
965+
)
966+
967+
doc = Document(content=long_text)
968+
splitter = RecursiveDocumentSplitter(
969+
split_length=200, split_overlap=0, split_unit="char", separators=["\n\n", "\n", " "]
970+
)
971+
splitter.warm_up()
972+
result = splitter.run([doc])
973+
chunks = result["documents"]
974+
975+
assert len(chunks) == 4
976+
977+
assert len(chunks[0].content) == 152
978+
assert chunks[0].content.startswith("A")
979+
980+
assert len(chunks[1].content) == 101
981+
assert chunks[1].content.startswith("B")
982+
983+
assert len(chunks[2].content) == 107
984+
assert chunks[2].content.startswith("B")
985+
986+
assert len(chunks[3].content) == 152
987+
assert chunks[3].content.startswith("C")
988+
assert chunks[3].content.endswith("D" * 50)

0 commit comments

Comments
 (0)