@@ -426,7 +426,7 @@ def test_run_separator_exists_but_split_length_too_small_fall_back_to_character_
426426 result = splitter .run (documents = [doc ])
427427 assert len (result ["documents" ]) == 10
428428 for doc in result ["documents" ]:
429- if re .escape (doc .content ) not in ["\ " ]:
429+ if re .escape (doc .content ) not in ["\\ " ]:
430430 assert len (doc .content ) == 2
431431
432432
@@ -944,3 +944,45 @@ def test_run_without_warm_up_raises_error():
944944 result = splitter_no_warmup .run (docs )
945945 assert len (result ["documents" ]) == 1
946946 assert result ["documents" ][0 ].content == "text"
947+
948+
def test_run_complex_text_with_multiple_separators():
    """
    Test that RecursiveDocumentSplitter correctly handles complex text with multiple separators and chunks that exceed
    the split_length.

    The input consists of four sections joined by blank lines ("\\n\\n"):

    * 150 x "A"
    * 100 x "B", a single newline, 105 x "B" (206 characters, longer than ``split_length``)
    * 100 x "C"
    * 50 x "D"

    With ``split_length=200`` and no overlap the test expects four chunks; the asserted lengths below include the
    separator characters that trail each section.
    """
    # Build the text section by section. The separators must be exactly "\n\n" and "\n" (no padding spaces),
    # otherwise the character counts asserted below no longer add up.
    long_text = (
        "A" * 150
        + "\n\n"  # triggers first-level split on \n\n
        + "B" * 100
        + "\n"  # only a second-level separator; keeps both "B" runs in one first-level section
        + "B" * 105
        + "\n\n"  # the preceding "B" section exceeds split_length and goes through recursion
        + "C" * 100  # short chunk 1
        + "\n\n"
        + "D" * 50  # short chunk 2
    )

    doc = Document(content=long_text)
    splitter = RecursiveDocumentSplitter(
        split_length=200, split_overlap=0, split_unit="char", separators=["\n\n", "\n", " "]
    )
    splitter.warm_up()
    result = splitter.run([doc])
    chunks = result["documents"]

    assert len(chunks) == 4

    # 150 "A" characters plus the two-character "\n\n" separator.
    assert len(chunks[0].content) == 152
    assert chunks[0].content.startswith("A")

    # First half of the oversized "B" section: 100 "B" characters plus the single "\n".
    assert len(chunks[1].content) == 101
    assert chunks[1].content.startswith("B")

    # Second half of the oversized "B" section: 105 "B" characters plus the trailing "\n\n".
    assert len(chunks[2].content) == 107
    assert chunks[2].content.startswith("B")

    # The two short sections fit in one chunk together: 100 "C" + "\n\n" + 50 "D" = 152 characters.
    assert len(chunks[3].content) == 152
    assert chunks[3].content.startswith("C")
    assert chunks[3].content.endswith("D" * 50)
0 commit comments