Skip to content

Commit 3b0a2d7

Browse files
authored
chore: hanlp - avoid mutating documents and add tests (#3206)
1 parent 4b475ce commit 3b0a2d7

2 files changed

Lines changed: 143 additions & 37 deletions

File tree

integrations/hanlp/src/haystack_integrations/components/preprocessors/hanlp/chinese_document_splitter.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from collections.abc import Callable
66
from copy import deepcopy
7+
from dataclasses import replace
78
from typing import Any, Literal
89

910
from haystack import Document, component, logging
@@ -440,10 +441,7 @@ def _create_docs_from_splits(
440441
previous_doc_start_idx = splits_start_idxs[i - 1]
441442
self._add_split_overlap_information(doc, doc_start_idx, previous_doc, previous_doc_start_idx)
442443

443-
for d in documents:
444-
if d.content is not None:
445-
d.content = d.content.replace(" ", "")
446-
return documents
444+
return [replace(d, content=d.content.replace(" ", "")) if d.content is not None else d for d in documents]
447445

448446
@staticmethod
449447
def _add_split_overlap_information(

integrations/hanlp/tests/test_chinese_document_splitter.py

Lines changed: 141 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
from unittest.mock import MagicMock, patch
6+
57
import pytest
68
from haystack import Document
79
from haystack.utils import deserialize_callable, serialize_callable
@@ -90,6 +92,145 @@ def test_from_dict_with_splitting_function(self):
9092
assert callable(splitter.splitting_function)
9193
assert splitter.splitting_function("a.b.c") == ["a", "b", "c"]
9294

95+
def test_validate_init_parameters(self):
96+
ChineseDocumentSplitter._validate_init_parameters(
97+
split_by="word",
98+
split_length=1000,
99+
split_overlap=200,
100+
split_threshold=0,
101+
granularity="coarse",
102+
)
103+
104+
with pytest.raises(ValueError, match="split_length must be positive"):
105+
ChineseDocumentSplitter._validate_init_parameters(split_length=0)
106+
107+
with pytest.raises(ValueError, match="split_overlap must be non-negative"):
108+
ChineseDocumentSplitter._validate_init_parameters(split_overlap=-1)
109+
110+
with pytest.raises(ValueError, match="split_overlap must be less than split_length"):
111+
ChineseDocumentSplitter._validate_init_parameters(split_overlap=1000, split_length=500)
112+
113+
with pytest.raises(ValueError, match="split_threshold must be non-negative"):
114+
ChineseDocumentSplitter._validate_init_parameters(split_threshold=-1)
115+
116+
with pytest.raises(ValueError, match="split_threshold must be less than split_length"):
117+
ChineseDocumentSplitter._validate_init_parameters(split_threshold=1001, split_length=1000)
118+
119+
with pytest.raises(
120+
ValueError,
121+
match="split_by must be one of 'word', 'sentence', 'passage', 'page', 'line', 'period', 'function'",
122+
):
123+
ChineseDocumentSplitter._validate_init_parameters(split_by="invalid")
124+
125+
with pytest.raises(ValueError, match="granularity must be one of 'coarse', 'fine'"):
126+
ChineseDocumentSplitter._validate_init_parameters(granularity="invalid")
127+
128+
def test_split_by_character_returns_empty_for_none_content(self):
129+
splitter = ChineseDocumentSplitter(split_by="word")
130+
assert splitter._split_by_character(Document(content=None)) == []
131+
132+
def test_split_by_hanlp_sentence_returns_empty_for_none_content(self):
133+
splitter = ChineseDocumentSplitter(split_by="sentence")
134+
assert splitter._split_by_hanlp_sentence(Document(content=None)) == []
135+
136+
def test_split_document_dispatches_to_function_splitter(self):
137+
splitter = ChineseDocumentSplitter(split_by="function", splitting_function=custom_split)
138+
doc = Document(content="a.b.c")
139+
result = splitter._split_document(doc)
140+
assert [d.content for d in result] == ["a", "b", "c"]
141+
142+
def test_split_by_function_returns_empty_for_none_content(self):
143+
splitter = ChineseDocumentSplitter(split_by="function", splitting_function=custom_split)
144+
assert splitter._split_by_function(Document(content=None)) == []
145+
146+
def test_split_by_function_raises_when_no_function_provided(self):
147+
splitter = ChineseDocumentSplitter(split_by="function", splitting_function=custom_split)
148+
splitter.splitting_function = None
149+
with pytest.raises(ValueError, match=r"No splitting function provided\."):
150+
splitter._split_by_function(Document(content="a.b"))
151+
152+
def test_split_by_function_raises_when_function_returns_non_list(self):
153+
splitter = ChineseDocumentSplitter(split_by="function", splitting_function=lambda t: "not a list")
154+
with pytest.raises(ValueError, match="must return a list of strings"):
155+
splitter._split_by_function(Document(content="a.b"))
156+
157+
def test_split_by_function_produces_documents_with_metadata(self):
158+
splitter = ChineseDocumentSplitter(split_by="function", splitting_function=custom_split)
159+
doc = Document(content="a.b.c", meta={"name": "src"})
160+
result = splitter._split_by_function(doc)
161+
assert len(result) == 3
162+
assert [d.content for d in result] == ["a", "b", "c"]
163+
assert all(d.meta["source_id"] == doc.id for d in result)
164+
assert all(d.meta["name"] == "src" for d in result)
165+
assert [d.meta["split_id"] for d in result] == [0, 1, 2]
166+
167+
@pytest.mark.parametrize(
168+
("previous_content", "current_content"),
169+
[(None, "abc"), ("abc", None)],
170+
)
171+
def test_add_split_overlap_information_skips_when_content_is_none(self, previous_content, current_content):
172+
previous_doc = Document(content=previous_content, meta={"_split_overlap": []})
173+
current_doc = Document(content=current_content, meta={"_split_overlap": []})
174+
ChineseDocumentSplitter._add_split_overlap_information(current_doc, 0, previous_doc, 0)
175+
assert current_doc.meta["_split_overlap"] == []
176+
assert previous_doc.meta["_split_overlap"] == []
177+
178+
def test_add_split_overlap_information_skips_when_no_matching_prefix(self):
179+
previous_doc = Document(content="abcdef", meta={"_split_overlap": []})
180+
current_doc = Document(content="xyz", meta={"_split_overlap": []})
181+
ChineseDocumentSplitter._add_split_overlap_information(current_doc, 2, previous_doc, 0)
182+
assert current_doc.meta["_split_overlap"] == []
183+
assert previous_doc.meta["_split_overlap"] == []
184+
185+
@pytest.mark.parametrize("elements", [[], ["", " ", "\t"]])
186+
def test_concatenate_units_returns_empty_for_empty_or_whitespace(self, elements):
187+
splitter = ChineseDocumentSplitter(split_by="word", split_length=5, split_overlap=1)
188+
assert splitter._concatenate_units(elements, split_length=5, split_overlap=1, split_threshold=0) == ([], [], [])
189+
190+
def test_concatenate_units_returns_single_chunk_for_short_input(self):
191+
splitter = ChineseDocumentSplitter(split_by="word", split_length=10, split_overlap=0)
192+
text_splits, pages, start_idxs = splitter._concatenate_units(
193+
["a", "b", "c"], split_length=10, split_overlap=0, split_threshold=0
194+
)
195+
assert text_splits == ["abc"]
196+
assert pages == [1]
197+
assert start_idxs == [0]
198+
199+
def test_concatenate_units_applies_split_threshold(self):
200+
splitter = ChineseDocumentSplitter(split_by="word", split_length=2, split_overlap=0)
201+
text_splits, _, _ = splitter._concatenate_units(
202+
["a", "b", "c"], split_length=2, split_overlap=0, split_threshold=2
203+
)
204+
# the trailing "c" chunk is shorter than split_threshold=2, so it is merged into the previous chunk
205+
assert text_splits == ["abc"]
206+
207+
def test_number_of_sentences_to_keep_returns_zero_when_overlap_is_zero(self):
208+
splitter = ChineseDocumentSplitter(split_by="word", split_length=10, split_overlap=0)
209+
assert splitter._number_of_sentences_to_keep(["s1", "s2", "s3"], split_length=10, split_overlap=0) == 0
210+
211+
def test_number_of_sentences_to_keep_stops_when_overlap_reached(self):
212+
splitter = ChineseDocumentSplitter(split_by="word", split_length=10, split_overlap=2)
213+
# each sentence tokenizes to exactly one word, so 3 sentences are needed before the word count exceeds split_overlap=2
214+
splitter.chinese_tokenizer = MagicMock(side_effect=lambda s: [s])
215+
result = splitter._number_of_sentences_to_keep(["s1", "s2", "s3", "s4"], split_length=10, split_overlap=2)
216+
assert result == 3
217+
218+
def test_number_of_sentences_to_keep_stops_when_split_length_exceeded(self):
219+
splitter = ChineseDocumentSplitter(split_by="word", split_length=2, split_overlap=1)
220+
# each sentence tokenizes to 5 "words", so the very first iteration already exceeds split_length=2
221+
splitter.chinese_tokenizer = MagicMock(return_value=["a", "b", "c", "d", "e"])
222+
assert splitter._number_of_sentences_to_keep(["s1", "s2", "s3"], split_length=2, split_overlap=1) == 0
223+
224+
def test_warm_up_is_idempotent(self):
225+
splitter = ChineseDocumentSplitter(split_by="word")
226+
with patch(
227+
"haystack_integrations.components.preprocessors.hanlp.chinese_document_splitter.hanlp"
228+
) as mock_hanlp:
229+
mock_hanlp.load.return_value = MagicMock()
230+
splitter.warm_up()
231+
splitter.warm_up()
232+
assert mock_hanlp.load.call_count == 2 # once for tokenizer, once for split_sent
233+
93234
@pytest.mark.integration
94235
def test_empty_list(self):
95236
splitter = ChineseDocumentSplitter()
@@ -216,36 +357,3 @@ def has_any_overlap(suffix: str, prefix: str) -> bool:
216357
f"Chunks {i} and {i + 1} do not overlap. "
217358
f"Tail (up to 20 chars): '{overlap_prev}' vs Head (up to 20 chars): '{overlap_curr}'"
218359
)
219-
220-
def test_validate_init_parameters(self):
221-
ChineseDocumentSplitter._validate_init_parameters(
222-
split_by="word",
223-
split_length=1000,
224-
split_overlap=200,
225-
split_threshold=0,
226-
granularity="coarse",
227-
)
228-
229-
with pytest.raises(ValueError, match="split_length must be positive"):
230-
ChineseDocumentSplitter._validate_init_parameters(split_length=0)
231-
232-
with pytest.raises(ValueError, match="split_overlap must be non-negative"):
233-
ChineseDocumentSplitter._validate_init_parameters(split_overlap=-1)
234-
235-
with pytest.raises(ValueError, match="split_overlap must be less than split_length"):
236-
ChineseDocumentSplitter._validate_init_parameters(split_overlap=1000, split_length=500)
237-
238-
with pytest.raises(ValueError, match="split_threshold must be non-negative"):
239-
ChineseDocumentSplitter._validate_init_parameters(split_threshold=-1)
240-
241-
with pytest.raises(ValueError, match="split_threshold must be less than split_length"):
242-
ChineseDocumentSplitter._validate_init_parameters(split_threshold=1001, split_length=1000)
243-
244-
with pytest.raises(
245-
ValueError,
246-
match="split_by must be one of 'word', 'sentence', 'passage', 'page', 'line', 'period', 'function'",
247-
):
248-
ChineseDocumentSplitter._validate_init_parameters(split_by="invalid")
249-
250-
with pytest.raises(ValueError, match="granularity must be one of 'coarse', 'fine'"):
251-
ChineseDocumentSplitter._validate_init_parameters(granularity="invalid")

0 commit comments

Comments
 (0)