Skip to content

Commit 05ed5f6

Browse files
committed
fix(test): use real Documents and correct lengths for splitter merge validation
1 parent 1029da0 commit 05ed5f6

1 file changed

Lines changed: 13 additions & 16 deletions

File tree

tests/test_splitter.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,20 +31,15 @@ def test_semantic_double_merging_logic(mock_embed_model, mocker):
3131
min_chunk_size=100
3232
)
3333

34-
# Mocking the base class behavior (Pass 1)
35-
# We simulate that the base splitter returned 3 nodes: two small ones and one normal
36-
node1 = MagicMock(spec=Document)
37-
node1.get_content.return_value = "Short text." # 11 chars
38-
39-
node2 = MagicMock(spec=Document)
40-
node2.get_content.return_value = "Another short bit." # 18 chars
41-
42-
node3 = MagicMock(spec=Document)
43-
node3.get_content.return_value = "This is a much longer text that should exceed the minimum chunk size threshold for merging logic." # ~100 chars
34+
# Use real Document objects to ensure content updates (set_content) work as expected
35+
node1 = Document(text="A" * 60) # 60 chars
36+
node2 = Document(text="B" * 50) # 50 chars. node1 + node2 = 110 chars of text (111 with the joining newline), > 100
37+
node3 = Document(text="C" * 110) # 110 chars. Already big enough.
4438

45-
# Inject mock into super().get_nodes_from_documents via mocker
39+
# Inject mock into the base class method.
40+
# We patch it in the splitter module where it's imported.
4641
mocker.patch(
47-
'llama_index.core.node_parser.SemanticSplitterNodeParser.get_nodes_from_documents',
42+
'src.processing.splitter.SemanticSplitterNodeParser.get_nodes_from_documents',
4843
return_value=[node1, node2, node3]
4944
)
5045

@@ -53,11 +48,13 @@ def test_semantic_double_merging_logic(mock_embed_model, mocker):
5348
nodes = list(splitter.get_nodes_generator([doc]))
5449

5550
# Validation
56-
# node1 and node2 should have been merged because len(node1) < 100
57-
# Final result should have 2 nodes (Merge of 1+2 and the isolated 3)
51+
# node1 and node2 should have been merged because len(node1) < 100.
52+
# After merging node2, node1's length becomes 111 (60 + 1 (newline) + 50).
53+
# Since 111 > 100, the merged node (node1+node2) is yielded when node3 is processed, and node3 starts fresh as its own chunk.
54+
# Final result should have 2 nodes: (node1+node2) and (node3)
5855
assert len(nodes) == 2
59-
assert "Short text." in nodes[0].get_content()
60-
assert "Another short bit." in nodes[0].get_content()
56+
assert "A" * 60 in nodes[0].get_content()
57+
assert "B" * 50 in nodes[0].get_content()
6158
assert nodes[1].get_content() == node3.get_content()
6259

6360
def test_generator_memory_efficiency(mock_embed_model):

0 commit comments

Comments
 (0)