diff --git a/haystack/components/preprocessors/__init__.py b/haystack/components/preprocessors/__init__.py
index cdbdf4a7a2..330bc8e05f 100644
--- a/haystack/components/preprocessors/__init__.py
+++ b/haystack/components/preprocessors/__init__.py
@@ -8,6 +8,7 @@ from lazy_imports import LazyImporter
 
 _import_structure = {
+    "chinese_document_splitter": ["ChineseDocumentSplitter"],
     "csv_document_cleaner": ["CSVDocumentCleaner"],
     "csv_document_splitter": ["CSVDocumentSplitter"],
     "document_cleaner": ["DocumentCleaner"],
@@ -19,6 +20,7 @@ }
 
 if TYPE_CHECKING:
+    from .chinese_document_splitter import ChineseDocumentSplitter
     from .csv_document_cleaner import CSVDocumentCleaner
     from .csv_document_splitter import CSVDocumentSplitter
     from .document_cleaner import DocumentCleaner
diff --git a/haystack/components/preprocessors/chinese_document_splitter.py b/haystack/components/preprocessors/chinese_document_splitter.py
new file mode 100644
index 0000000000..552b53b7f6
--- /dev/null
+++ b/haystack/components/preprocessors/chinese_document_splitter.py
@@ -0,0 +1,361 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from copy import deepcopy
+from typing import Any, Dict, List, Literal, Tuple
+
+from more_itertools import windowed
+
+from haystack import Document, component, logging
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.lazy_imports import LazyImport
+
+with LazyImport("Run 'pip install hanlp'") as hanlp_import:
+    import hanlp
+
+
+logger = logging.getLogger(__name__)
+
+# Mapping of `split_by` values to split characters; 'function' and 'sentence' don't split by a character.
+_CHARACTER_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"}
+
+
+@component
+class ChineseDocumentSplitter(DocumentSplitter):
+    def __init__(self, *args, particle_size: Literal["coarse", "fine"] = "coarse", **kwargs):
+        """
+        A DocumentSplitter for Chinese text.
+
+        'coarse' selects coarse-grained Chinese word segmentation, 'fine' selects fine-grained segmentation;
+        the default is coarse-grained segmentation.
+
+        :param particle_size: The granularity of Chinese word segmentation, either 'coarse' or 'fine'.
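+        :param args: Positional arguments forwarded to the parent `DocumentSplitter`.
+        :param kwargs: Keyword arguments forwarded to the parent `DocumentSplitter`, for example `split_by`,
+            `split_length`, `split_overlap`, `respect_sentence_boundary` and `language`.
+
+        Usage example (an illustrative sketch; the exact chunks depend on the loaded HanLP models):
+        ```python
+        from haystack import Document
+        from haystack.components.preprocessors import ChineseDocumentSplitter
+
+        splitter = ChineseDocumentSplitter(split_by="word", language="zh", split_length=10, split_overlap=3)
+        splitter.warm_up()
+        result = splitter.run(documents=[Document(content="月光轻轻洒落,林中传来阵阵狼嚎,夜色悄然笼罩一切。")])
+        print(result["documents"])
+        ```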
+
+        """
+        super().__init__(*args, **kwargs)
+        self.particle_size = particle_size
+
+        hanlp_import.check()
+
+        self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
+        self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
+        self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)  # Load the Chinese sentence splitter
+
+    def _split_by_character(self, doc: Document) -> List[Document]:
+        """
+        Splits the document content by the character mapped to the configured `split_by` unit.
+
+        :param doc: The document to split.
+        :returns: A list of documents with the split texts.
+        """
+        split_at = _CHARACTER_SPLIT_BY_MAPPING[self.split_by]
+
+        # 'coarse' selects coarse-grained Chinese word segmentation, 'fine' selects fine-grained segmentation;
+        # the default is coarse-grained segmentation.
+        if self.language == "zh" and self.particle_size == "coarse":
+            units = self.chinese_tokenizer_coarse(doc.content)
+
+        if self.language == "zh" and self.particle_size == "fine":
+            units = self.chinese_tokenizer_fine(doc.content)
+
+        if self.language == "en":
+            units = doc.content.split(split_at)
+
+        # Add the delimiter back to all units except the last one
+        for i in range(len(units) - 1):  # pylint: disable=possibly-used-before-assignment
+            units[i] += split_at
+        text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
+            units, self.split_length, self.split_overlap, self.split_threshold
+        )
+        metadata = deepcopy(doc.meta)
+        metadata["source_id"] = doc.id
+
+        return self._create_docs_from_splits(
+            text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
+        )
+
+    def chinese_sentence_split(self, text: str) -> List[Dict[str, Any]]:
+        """Split Chinese text into sentences."""
+        # Split sentences
+        sentences = self.split_sent(text)
+
+        # Organize the format of segmented sentences
+        results = []
+        start = 0
+        for sentence in sentences:
+            start = text.find(sentence, start)
+            end = start + len(sentence)
+            results.append({"sentence": sentence + "\n", "start": start, "end": end})
+            start = end
+
+        return results
+
+    def _split_document(self, doc: Document) -> List[Document]:
+        if self.split_by == "sentence" or self.respect_sentence_boundary:
+            return self._split_by_nltk_sentence(doc)
+
+        if self.split_by == "function" and self.splitting_function is not None:
+            return self._split_by_function(doc)
+
+        return self._split_by_character(doc)
+
+    def _concatenate_sentences_based_on_word_amount(  # pylint: disable=too-many-positional-arguments
+        self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
+    ) -> Tuple[List[str], List[int], List[int]]:
+        """
+        Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.
+
+        This function is only used when splitting by `word` and `respect_sentence_boundary` is set to `True`,
+        i.e. with a sentence tokenizer.
+
+        :param sentences: The list of sentences to split.
+        :param split_length: The maximum number of words in each split.
+        :param split_overlap: The number of overlapping words in each split.
+        :returns: A tuple containing the concatenated sentences, the start page numbers, and the start indices.
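+        :param language: The language of the text; `zh` uses the HanLP tokenizer selected by `particle_size`.
+        :param particle_size: The granularity of Chinese word segmentation, either 'coarse' or 'fine'.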
+ """ + # chunk information + chunk_word_count = 0 + chunk_starting_page_number = 1 + chunk_start_idx = 0 + current_chunk: List[str] = [] + # output lists + split_start_page_numbers = [] + list_of_splits: List[List[str]] = [] + split_start_indices = [] + # chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH) + # chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH) + for sentence_idx, sentence in enumerate(sentences): + current_chunk.append(sentence) + if language == "zh" and particle_size == "coarse": + chunk_word_count += len(self.chinese_tokenizer_coarse(sentence)) + next_sentence_word_count = ( + len(self.chinese_tokenizer_coarse(sentences[sentence_idx + 1])) + if sentence_idx < len(sentences) - 1 + else 0 + ) + if language == "zh" and particle_size == "fine": + chunk_word_count += len(self.chinese_tokenizer_fine(sentence)) + next_sentence_word_count = ( + len(self.chinese_tokenizer_fine(sentences[sentence_idx + 1])) + if sentence_idx < len(sentences) - 1 + else 0 + ) + + # Number of words in the current chunk plus the next sentence is larger than the split_length, + # or we reached the last sentence + if (chunk_word_count + next_sentence_word_count) > split_length or sentence_idx == len(sentences) - 1: # pylint: disable=possibly-used-before-assignment + # Save current chunk and start a new one + list_of_splits.append(current_chunk) + split_start_page_numbers.append(chunk_starting_page_number) + split_start_indices.append(chunk_start_idx) + + # Get the number of sentences that overlap with the next chunk + num_sentences_to_keep = self._number_of_sentences_to_keep( + sentences=current_chunk, + split_length=split_length, + split_overlap=split_overlap, + language=language, + particle_size=particle_size, + ) + # Set up information for the new chunk + if num_sentences_to_keep > 0: + # Processed sentences are the ones that are not overlapping with the next chunk + processed_sentences = current_chunk[:-num_sentences_to_keep] + chunk_starting_page_number += sum(sent.count("\f") for sent in processed_sentences) + chunk_start_idx += len("".join(processed_sentences)) + # Next chunk starts with the sentences that were overlapping with the previous chunk + current_chunk = current_chunk[-num_sentences_to_keep:] + chunk_word_count = sum(len(s.split()) for s in current_chunk) + else: + # Here processed_sentences is the same as current_chunk since there is no overlap + chunk_starting_page_number += sum(sent.count("\f") for sent in current_chunk) + chunk_start_idx += len("".join(current_chunk)) + current_chunk = [] + chunk_word_count = 0 + + # Concatenate the sentences together within each split + text_splits = [] + for split in list_of_splits: + text = "".join(split) + if len(text) > 0: + text_splits.append(text) + + return text_splits, split_start_page_numbers, split_start_indices + + # Add Chinese sentence segmentation and enable it using language=="zh" + def _split_by_nltk_sentence(self, doc: Document) -> List[Document]: + split_docs = [] + + if self.language == "zh": + result = self.chinese_sentence_split(doc.content) + if self.language == "en": + result = self.sentence_splitter.split_sentences(doc.content) # type: ignore # None check is done in run() + + units = [sentence["sentence"] for sentence in result] + + if self.respect_sentence_boundary: + text_splits, splits_pages, splits_start_idxs = self._concatenate_sentences_based_on_word_amount( + sentences=units, + split_length=self.split_length, + split_overlap=self.split_overlap, + 
language=self.language, + particle_size=self.particle_size, + ) + else: + text_splits, splits_pages, splits_start_idxs = self._concatenate_units( + elements=units, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + ) + metadata = deepcopy(doc.meta) + metadata["source_id"] = doc.id + split_docs += self._create_docs_from_splits( + text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata + ) + + return split_docs + + def _concatenate_units( + self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int + ) -> Tuple[List[str], List[int], List[int]]: + """ + Concatenates the elements into parts of split_length units. + + Keeps track of the original page number that each element belongs. If the length of the current units is less + than the pre-defined `split_threshold`, it does not create a new split. Instead, it concatenates the current + units with the last split, preventing the creation of excessively small splits. + """ + + text_splits: List[str] = [] + splits_pages: List[int] = [] + splits_start_idxs: List[int] = [] + cur_start_idx = 0 + cur_page = 1 + segments = windowed(elements, n=split_length, step=split_length - split_overlap) + + for seg in segments: + current_units = [unit for unit in seg if unit is not None] + txt = "".join(current_units) + + # check if length of current units is below split_threshold + if len(current_units) < split_threshold and len(text_splits) > 0: + # concatenate the last split with the current one + text_splits[-1] += txt + + # NOTE: This line skips documents that have content="" + elif len(txt) > 0: + text_splits.append(txt) + splits_pages.append(cur_page) + splits_start_idxs.append(cur_start_idx) + + processed_units = current_units[: split_length - split_overlap] + cur_start_idx += len("".join(processed_units)) + + if self.split_by == "page": + num_page_breaks = len(processed_units) + else: + num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units) + + cur_page += num_page_breaks + + return text_splits, splits_pages, splits_start_idxs + + def _create_docs_from_splits( + self, text_splits: List[str], splits_pages: List[int], splits_start_idxs: List[int], meta: Dict[str, Any] + ) -> List[Document]: + """ + Creates Document objects from splits enriching them with page number and the metadata of the original document. + """ + documents: List[Document] = [] + + for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)): + copied_meta = deepcopy(meta) + copied_meta["page_number"] = splits_pages[i] + copied_meta["split_id"] = i + copied_meta["split_idx_start"] = split_idx + doc = Document(content=txt, meta=copied_meta) + documents.append(doc) + + if self.split_overlap <= 0: + continue + + doc.meta["_split_overlap"] = [] + + if i == 0: + continue + + doc_start_idx = splits_start_idxs[i] + previous_doc = documents[i - 1] + previous_doc_start_idx = splits_start_idxs[i - 1] + self._add_split_overlap_information(doc, doc_start_idx, previous_doc, previous_doc_start_idx) + + for d in documents: + d.content = d.content.replace(" ", "") + return documents + + @staticmethod + def _add_split_overlap_information( + current_doc: Document, current_doc_start_idx: int, previous_doc: Document, previous_doc_start_idx: int + ): + """ + Adds split overlap information to the current and previous Document's meta. + + :param current_doc: The Document that is being split. 
+ :param current_doc_start_idx: The starting index of the current Document. + :param previous_doc: The Document that was split before the current Document. + :param previous_doc_start_idx: The starting index of the previous Document. + """ + overlapping_range = (current_doc_start_idx - previous_doc_start_idx, len(previous_doc.content)) # type: ignore + + if overlapping_range[0] < overlapping_range[1]: + # type: ignore + overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]] + + if current_doc.content.startswith(overlapping_str): # type: ignore + # add split overlap information to this Document regarding the previous Document + current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": overlapping_range}) + + # add split overlap information to previous Document regarding this Document + overlapping_range = (0, overlapping_range[1] - overlapping_range[0]) + previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range}) + + def _number_of_sentences_to_keep( # pylint: disable=too-many-positional-arguments + self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str + ) -> int: + """ + Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`. + + :param sentences: The list of sentences to split. + :param split_length: The maximum number of words in each split. + :param split_overlap: The number of overlapping words in each split. + :returns: The number of sentences to keep in the next chunk. + """ + # If the split_overlap is 0, we don't need to keep any sentences + if split_overlap == 0: + return 0 + + num_sentences_to_keep = 0 + num_words = 0 + + for sent in reversed(sentences[1:]): + if language == "zh" and particle_size == "coarse": + num_words += len(self.chinese_tokenizer_coarse(sent)) + if language == "zh" and particle_size == "fine": + num_words += len(self.chinese_tokenizer_fine(sent)) + # If the number of words is larger than the split_length then don't add any more sentences + if num_words > split_length: + break + num_sentences_to_keep += 1 + if num_words > split_overlap: + break + return num_sentences_to_keep diff --git a/pyproject.toml b/pyproject.toml index 73cd0d2cae..317db06c6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -115,7 +115,8 @@ dependencies = [ "python-oxmsg", # MSGToDocument "nltk>=3.9.1", # NLTKDocumentSplitter, RecursiveDocumentSplitter - "tiktoken", # RecursiveDocumentSplitter + "tiktoken", # RecursiveDocumentSplitter + "hanlp", # ChineseDocumentSplitter # OpenAPI "jsonref", # OpenAPIServiceConnector, OpenAPIServiceToFunctions @@ -290,6 +291,7 @@ warn_unused_configs = true ignore_missing_imports = true check_untyped_defs = true + [[tool.mypy.overrides]] # TODO: Fix component typings module = ["haystack.components.*", "haystack.testing.*"] diff --git a/releasenotes/notes/add-chinese-document-splitter.yaml b/releasenotes/notes/add-chinese-document-splitter.yaml new file mode 100644 index 0000000000..3b8faaec03 --- /dev/null +++ b/releasenotes/notes/add-chinese-document-splitter.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Adding a new component `ChineseDocumentSplitter`, enabling precise splitting of Chinese documents. 
diff --git a/releasenotes/notes/add-current-date-promptbuilder-ff60c846f5a70dc6.yaml b/releasenotes/notes/add-current-date-promptbuilder-ff60c846f5a70dc6.yaml deleted file mode 100644 index 07dacf43a8..0000000000 --- a/releasenotes/notes/add-current-date-promptbuilder-ff60c846f5a70dc6.yaml +++ /dev/null @@ -1,16 +0,0 @@ ---- -enhancements: - - | - Allow the ability to add the current date inside a template in `PromptBuilder` using the following syntax: - - - `{% now 'UTC' %}`: Get the current date for the UTC timezone. - - - `{% now 'America/Chicago' + 'hours=2' %}`: Add two hours to the current date in the Chicago timezone. - - - `{% now 'Europe/Berlin' - 'weeks=2' %}`: Subtract two weeks from the current date in the Berlin timezone. - - - `{% now 'Pacific/Fiji' + 'hours=2', '%H' %}`: Display only the number of hours after adding two hours to the Fiji timezone. - - - `{% now 'Etc/GMT-4', '%I:%M %p' %}`: Change the date format to AM/PM for the GMT-4 timezone. - - Note that if no date format is provided, the default will be `%Y-%m-%d %H:%M:%S`. Please refer to [list of tz database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) for a list of timezones. \ No newline at end of file diff --git a/releasenotes/notes/single-meta-in-azureconverter-ce1cc196a9b161f3.yaml b/releasenotes/notes/single-meta-in-azureconverter-ce1cc196a9b161f3.yaml deleted file mode 100644 index ccc9675002..0000000000 --- a/releasenotes/notes/single-meta-in-azureconverter-ce1cc196a9b161f3.yaml +++ /dev/null @@ -1,5 +0,0 @@ ---- -enhancements: - - | - Adds support for single metadata dictionary input in `AzureOCRDocumentConverter`. In this way, additional metadata can be added to all files processed by this component even when the length of the list of sources is unknown. - diff --git a/test/components/preprocessors/test_chinese_document_splitter.py b/test/components/preprocessors/test_chinese_document_splitter.py new file mode 100644 index 0000000000..4d4bfb8433 --- /dev/null +++ b/test/components/preprocessors/test_chinese_document_splitter.py @@ -0,0 +1,133 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from haystack import Document +from haystack.components.preprocessors.chinese_document_splitter import ChineseDocumentSplitter + + +class TestChineseDocumentSplitter: + @pytest.fixture + def sample_text(self) -> str: + return "这是第一句话,也是故事的开端,紧接着是第二句话,渐渐引出了背景;随后,翻开新/f的一页,我们读到了这一页的第一句话,继续延展出情节的发展,直到这页的第二句话将整段文字温柔地收束于平静之中。" + + def test_split_by_word(self, sample_text): + """ + Test splitting by word. + + Note on Chinese words: + Unlike English where words are usually separated by spaces, + Chinese text is written continuously without spaces between words. + Chinese words can consist of multiple characters. + For example, the English word "America" is translated to "美国" in Chinese, + which consists of two characters but is treated as a single word. + Similarly, "Portugal" is "葡萄牙" in Chinese, + three characters but one word. + Therefore, splitting by word means splitting by these multi-character tokens, + not simply by single characters or spaces. 
+ """ + splitter = ChineseDocumentSplitter( + split_by="word", language="zh", particle_size="coarse", split_length=5, split_overlap=0 + ) + if hasattr(splitter, "warm_up"): + splitter.warm_up() + + result = splitter.run(documents=[Document(content=sample_text)]) + docs = result["documents"] + + assert all(isinstance(doc, Document) for doc in docs) + assert all(len(doc.content.strip()) <= 10 for doc in docs) + + def test_split_by_sentence(self, sample_text): + splitter = ChineseDocumentSplitter( + split_by="sentence", language="zh", particle_size="coarse", split_length=10, split_overlap=0 + ) + if hasattr(splitter, "warm_up"): + splitter.warm_up() + + result = splitter.run(documents=[Document(content=sample_text)]) + docs = result["documents"] + + assert all(isinstance(doc, Document) for doc in docs) + assert all(doc.content.strip() != "" for doc in docs) + assert any("。" in doc.content for doc in docs), "Expected at least one chunk containing a full stop." + + def test_respect_sentence_boundary(self): + """Test that respect_sentence_boundary=True avoids splitting sentences""" + text = "这是第一句话,这是第二句话,这是第三句话。这是第四句话,这是第五句话,这是第六句话!这是第七句话,这是第八句话,这是第九句话?" + doc = Document(content=text) + + splitter = ChineseDocumentSplitter( + split_by="word", split_length=10, split_overlap=3, language="zh", respect_sentence_boundary=True + ) + splitter.warm_up() + result = splitter.run(documents=[doc]) + docs = result["documents"] + + print(f"Total chunks created: {len(docs)}.") + for i, d in enumerate(docs): + print(f"\nChunk {i + 1}:\n{d.content}") + # Optional: check that sentences are not cut off + assert d.content.strip().endswith(("。", "!", "?")), "Sentence was cut off!" + + def test_overlap_chunks_with_long_text(self): + """Test split_overlap parameter to ensure there is clear overlap between chunks of long text""" + text = ( + "月光轻轻洒落,林中传来阵阵狼嚎,夜色悄然笼罩一切。" + "树叶在微风中沙沙作响,影子在地面上摇曳不定。" + "一只猫头鹰静静地眨了眨眼,从枝头注视着四周……" + "远处的小溪哗啦啦地流淌,仿佛在向石头倾诉着什么。" + "“咔嚓”一声,某处的树枝突然断裂,然后恢复了寂静。" + "空气中弥漫着松树与湿土的气息,令人心安。" + "一只狐狸悄然出现,又迅速消失在灌木丛中。" + "天上的星星闪烁着,仿佛在诉说古老的故事。" + "时间仿佛停滞了……" + "万物静候,聆听着夜的呼吸!" + ) + doc = Document(content=text) + + splitter = ChineseDocumentSplitter( + split_by="word", language="zh", split_length=30, split_overlap=10, particle_size="coarse" + ) + if hasattr(splitter, "warm_up"): + splitter.warm_up() + + result = splitter.run(documents=[doc]) + docs = result["documents"] + + print(f"Total chunks generated: {len(docs)}.") + for i, d in enumerate(docs): + print(f"\nChunk {i + 1}:\n{d.content}") + + assert len(docs) > 1, "Expected multiple chunks to be generated" + + max_len_allowed = 80 # Allow a somewhat relaxed max chunk length + assert all(len(doc.content) <= max_len_allowed for doc in docs), ( + f"Some chunks exceed {max_len_allowed} characters" + ) + + def has_any_overlap(suffix: str, prefix: str) -> bool: + """ + Check if suffix and prefix have at least one continuous overlapping character sequence. + Tries from the longest possible overlap down to 1 character. + Returns True if any overlap found. 
+ """ + max_check_len = min(len(suffix), len(prefix)) + for length in range(max_check_len, 0, -1): + if suffix[-length:] == prefix[:length]: + return True + return False + + for i in range(1, len(docs)): + prev_chunk = docs[i - 1].content + curr_chunk = docs[i].content + + # Take last 20 chars of prev chunk and first 20 chars of current chunk to check overlap + overlap_prev = prev_chunk[-20:] + overlap_curr = curr_chunk[:20] + + assert has_any_overlap(overlap_prev, overlap_curr), ( + f"Chunks {i} and {i + 1} do not overlap. " + f"Tail (up to 20 chars): '{overlap_prev}' vs Head (up to 20 chars): '{overlap_curr}'" + )
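+
+    def test_metadata_of_split_documents(self):
+        """
+        Illustrative sanity check (a sketch rather than an exhaustive test): every split document should keep
+        the original document's metadata and gain `source_id`, `split_id`, `split_idx_start` and `page_number`.
+        """
+        doc = Document(content="这是第一句话,也是故事的开端。随后故事慢慢展开,直到结束。", meta={"title": "示例文档"})
+
+        splitter = ChineseDocumentSplitter(
+            split_by="word", language="zh", particle_size="coarse", split_length=5, split_overlap=0
+        )
+        if hasattr(splitter, "warm_up"):
+            splitter.warm_up()
+
+        docs = splitter.run(documents=[doc])["documents"]
+
+        assert len(docs) >= 1
+        assert all(d.meta["source_id"] == doc.id for d in docs)
+        assert all(d.meta["title"] == "示例文档" for d in docs)
+        assert [d.meta["split_id"] for d in docs] == list(range(len(docs)))
+        assert all("split_idx_start" in d.meta and "page_number" in d.meta for d in docs)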