diff --git a/haystack/components/preprocessors/__init__.py b/haystack/components/preprocessors/__init__.py
index cdbdf4a7a2..330bc8e05f 100644
--- a/haystack/components/preprocessors/__init__.py
+++ b/haystack/components/preprocessors/__init__.py
@@ -8,6 +8,7 @@ from lazy_imports import LazyImporter
 
 _import_structure = {
+    "chinese_document_splitter": ["ChineseDocumentSplitter"],
     "csv_document_cleaner": ["CSVDocumentCleaner"],
     "csv_document_splitter": ["CSVDocumentSplitter"],
     "document_cleaner": ["DocumentCleaner"],
@@ -19,6 +20,7 @@ }
 
 if TYPE_CHECKING:
+    from .chinese_document_splitter import ChineseDocumentSplitter
     from .csv_document_cleaner import CSVDocumentCleaner
     from .csv_document_splitter import CSVDocumentSplitter
     from .document_cleaner import DocumentCleaner
diff --git a/haystack/components/preprocessors/chinese_document_splitter.py b/haystack/components/preprocessors/chinese_document_splitter.py
new file mode 100644
index 0000000000..552b53b7f6
--- /dev/null
+++ b/haystack/components/preprocessors/chinese_document_splitter.py
@@ -0,0 +1,361 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from copy import deepcopy
+from typing import Any, Dict, List, Literal, Tuple
+
+from more_itertools import windowed
+
+from haystack import Document, component, logging
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.lazy_imports import LazyImport
+
+with LazyImport("Run 'pip install hanlp'") as hanlp_import:
+    import hanlp
+
+
+logger = logging.getLogger(__name__)
+
+# Mapping of `split_by` values to split characters; 'function' and 'sentence' don't split by a character.
+_CHARACTER_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"}
+
+
+@component
+class ChineseDocumentSplitter(DocumentSplitter):
+    def __init__(self, *args, particle_size: Literal["coarse", "fine"] = "coarse", **kwargs):
+        """
+        A DocumentSplitter for Chinese text.
+
+        'coarse' selects coarse-grained Chinese word segmentation, 'fine' selects fine-grained segmentation;
+        the default is coarse-grained segmentation.
+
+        :param particle_size: The granularity of Chinese word segmentation, either 'coarse' or 'fine'.
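+        :param args: Positional arguments forwarded to the parent `DocumentSplitter`.
+        :param kwargs: Keyword arguments forwarded to the parent `DocumentSplitter`, for example `split_by`,
+            `split_length`, `split_overlap`, `respect_sentence_boundary` and `language`.
+
+        Usage example (an illustrative sketch; the exact chunks depend on the loaded HanLP models):
+        ```python
+        from haystack import Document
+        from haystack.components.preprocessors import ChineseDocumentSplitter
+
+        splitter = ChineseDocumentSplitter(split_by="word", language="zh", split_length=10, split_overlap=3)
+        splitter.warm_up()
+        result = splitter.run(documents=[Document(content="月光轻轻洒落,林中传来阵阵狼嚎,夜色悄然笼罩一切。")])
+        print(result["documents"])
+        ```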
+
+        """
+        super().__init__(*args, **kwargs)
+        self.particle_size = particle_size
+
+        hanlp_import.check()
+
+        self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
+        self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
+        self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)  # Load the Chinese sentence splitter
+
+    def _split_by_character(self, doc: Document) -> List[Document]:
+        """
+        Splits the document content by the character mapped to the configured `split_by` unit.
+
+        :param doc: The document to split.
+        :returns: A list of documents with the split texts.
+        """
+        split_at = _CHARACTER_SPLIT_BY_MAPPING[self.split_by]
+
+        # 'coarse' selects coarse-grained Chinese word segmentation, 'fine' selects fine-grained segmentation;
+        # the default is coarse-grained segmentation.
+        if self.language == "zh" and self.particle_size == "coarse":
+            units = self.chinese_tokenizer_coarse(doc.content)
+
+        if self.language == "zh" and self.particle_size == "fine":
+            units = self.chinese_tokenizer_fine(doc.content)
+
+        if self.language == "en":
+            units = doc.content.split(split_at)
+
+        # Add the delimiter back to all units except the last one
+        for i in range(len(units) - 1):  # pylint: disable=possibly-used-before-assignment
+            units[i] += split_at
+        text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
+            units, self.split_length, self.split_overlap, self.split_threshold
+        )
+        metadata = deepcopy(doc.meta)
+        metadata["source_id"] = doc.id
+
+        return self._create_docs_from_splits(
+            text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
+        )
+
+    def chinese_sentence_split(self, text: str) -> List[Dict[str, Any]]:
+        """Split Chinese text into sentences."""
+        # Split sentences
+        sentences = self.split_sent(text)
+
+        # Organize the format of segmented sentences
+        results = []
+        start = 0
+        for sentence in sentences:
+            start = text.find(sentence, start)
+            end = start + len(sentence)
+            results.append({"sentence": sentence + "\n", "start": start, "end": end})
+            start = end
+
+        return results
+
+    def _split_document(self, doc: Document) -> List[Document]:
+        if self.split_by == "sentence" or self.respect_sentence_boundary:
+            return self._split_by_nltk_sentence(doc)
+
+        if self.split_by == "function" and self.splitting_function is not None:
+            return self._split_by_function(doc)
+
+        return self._split_by_character(doc)
+
+    def _concatenate_sentences_based_on_word_amount(  # pylint: disable=too-many-positional-arguments
+        self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
+    ) -> Tuple[List[str], List[int], List[int]]:
+        """
+        Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.
+
+        This function is only used when splitting by `word` and `respect_sentence_boundary` is set to `True`,
+        i.e. with a sentence tokenizer.
+
+        :param sentences: The list of sentences to split.
+        :param split_length: The maximum number of words in each split.
+        :param split_overlap: The number of overlapping words in each split.
+        :returns: A tuple containing the concatenated sentences, the start page numbers, and the start indices.
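+        :param language: The language of the text; `zh` uses the HanLP tokenizer selected by `particle_size`.
+        :param particle_size: The granularity of Chinese word segmentation, either 'coarse' or 'fine'.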
+ """ + # chunk information + chunk_word_count = 0 + chunk_starting_page_number = 1 + chunk_start_idx = 0 + current_chunk: List[str] = [] + # output lists + split_start_page_numbers = [] + list_of_splits: List[List[str]] = [] + split_start_indices = [] + # chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH) + # chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH) + for sentence_idx, sentence in enumerate(sentences): + current_chunk.append(sentence) + if language == "zh" and particle_size == "coarse": + chunk_word_count += len(self.chinese_tokenizer_coarse(sentence)) + next_sentence_word_count = ( + len(self.chinese_tokenizer_coarse(sentences[sentence_idx + 1])) + if sentence_idx < len(sentences) - 1 + else 0 + ) + if language == "zh" and particle_size == "fine": + chunk_word_count += len(self.chinese_tokenizer_fine(sentence)) + next_sentence_word_count = ( + len(self.chinese_tokenizer_fine(sentences[sentence_idx + 1])) + if sentence_idx < len(sentences) - 1 + else 0 + ) + + # Number of words in the current chunk plus the next sentence is larger than the split_length, + # or we reached the last sentence + if (chunk_word_count + next_sentence_word_count) > split_length or sentence_idx == len(sentences) - 1: # pylint: disable=possibly-used-before-assignment + # Save current chunk and start a new one + list_of_splits.append(current_chunk) + split_start_page_numbers.append(chunk_starting_page_number) + split_start_indices.append(chunk_start_idx) + + # Get the number of sentences that overlap with the next chunk + num_sentences_to_keep = self._number_of_sentences_to_keep( + sentences=current_chunk, + split_length=split_length, + split_overlap=split_overlap, + language=language, + particle_size=particle_size, + ) + # Set up information for the new chunk + if num_sentences_to_keep > 0: + # Processed sentences are the ones that are not overlapping with the next chunk + processed_sentences = current_chunk[:-num_sentences_to_keep] + chunk_starting_page_number += sum(sent.count("\f") for sent in processed_sentences) + chunk_start_idx += len("".join(processed_sentences)) + # Next chunk starts with the sentences that were overlapping with the previous chunk + current_chunk = current_chunk[-num_sentences_to_keep:] + chunk_word_count = sum(len(s.split()) for s in current_chunk) + else: + # Here processed_sentences is the same as current_chunk since there is no overlap + chunk_starting_page_number += sum(sent.count("\f") for sent in current_chunk) + chunk_start_idx += len("".join(current_chunk)) + current_chunk = [] + chunk_word_count = 0 + + # Concatenate the sentences together within each split + text_splits = [] + for split in list_of_splits: + text = "".join(split) + if len(text) > 0: + text_splits.append(text) + + return text_splits, split_start_page_numbers, split_start_indices + + # Add Chinese sentence segmentation and enable it using language=="zh" + def _split_by_nltk_sentence(self, doc: Document) -> List[Document]: + split_docs = [] + + if self.language == "zh": + result = self.chinese_sentence_split(doc.content) + if self.language == "en": + result = self.sentence_splitter.split_sentences(doc.content) # type: ignore # None check is done in run() + + units = [sentence["sentence"] for sentence in result] + + if self.respect_sentence_boundary: + text_splits, splits_pages, splits_start_idxs = self._concatenate_sentences_based_on_word_amount( + sentences=units, + split_length=self.split_length, + split_overlap=self.split_overlap, + 
language=self.language, + particle_size=self.particle_size, + ) + else: + text_splits, splits_pages, splits_start_idxs = self._concatenate_units( + elements=units, + split_length=self.split_length, + split_overlap=self.split_overlap, + split_threshold=self.split_threshold, + ) + metadata = deepcopy(doc.meta) + metadata["source_id"] = doc.id + split_docs += self._create_docs_from_splits( + text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata + ) + + return split_docs + + def _concatenate_units( + self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int + ) -> Tuple[List[str], List[int], List[int]]: + """ + Concatenates the elements into parts of split_length units. + + Keeps track of the original page number that each element belongs. If the length of the current units is less + than the pre-defined `split_threshold`, it does not create a new split. Instead, it concatenates the current + units with the last split, preventing the creation of excessively small splits. + """ + + text_splits: List[str] = [] + splits_pages: List[int] = [] + splits_start_idxs: List[int] = [] + cur_start_idx = 0 + cur_page = 1 + segments = windowed(elements, n=split_length, step=split_length - split_overlap) + + for seg in segments: + current_units = [unit for unit in seg if unit is not None] + txt = "".join(current_units) + + # check if length of current units is below split_threshold + if len(current_units) < split_threshold and len(text_splits) > 0: + # concatenate the last split with the current one + text_splits[-1] += txt + + # NOTE: This line skips documents that have content="" + elif len(txt) > 0: + text_splits.append(txt) + splits_pages.append(cur_page) + splits_start_idxs.append(cur_start_idx) + + processed_units = current_units[: split_length - split_overlap] + cur_start_idx += len("".join(processed_units)) + + if self.split_by == "page": + num_page_breaks = len(processed_units) + else: + num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units) + + cur_page += num_page_breaks + + return text_splits, splits_pages, splits_start_idxs + + def _create_docs_from_splits( + self, text_splits: List[str], splits_pages: List[int], splits_start_idxs: List[int], meta: Dict[str, Any] + ) -> List[Document]: + """ + Creates Document objects from splits enriching them with page number and the metadata of the original document. + """ + documents: List[Document] = [] + + for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)): + copied_meta = deepcopy(meta) + copied_meta["page_number"] = splits_pages[i] + copied_meta["split_id"] = i + copied_meta["split_idx_start"] = split_idx + doc = Document(content=txt, meta=copied_meta) + documents.append(doc) + + if self.split_overlap <= 0: + continue + + doc.meta["_split_overlap"] = [] + + if i == 0: + continue + + doc_start_idx = splits_start_idxs[i] + previous_doc = documents[i - 1] + previous_doc_start_idx = splits_start_idxs[i - 1] + self._add_split_overlap_information(doc, doc_start_idx, previous_doc, previous_doc_start_idx) + + for d in documents: + d.content = d.content.replace(" ", "") + return documents + + @staticmethod + def _add_split_overlap_information( + current_doc: Document, current_doc_start_idx: int, previous_doc: Document, previous_doc_start_idx: int + ): + """ + Adds split overlap information to the current and previous Document's meta. + + :param current_doc: The Document that is being split. 
+ :param current_doc_start_idx: The starting index of the current Document. + :param previous_doc: The Document that was split before the current Document. + :param previous_doc_start_idx: The starting index of the previous Document. + """ + overlapping_range = (current_doc_start_idx - previous_doc_start_idx, len(previous_doc.content)) # type: ignore + + if overlapping_range[0] < overlapping_range[1]: + # type: ignore + overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]] + + if current_doc.content.startswith(overlapping_str): # type: ignore + # add split overlap information to this Document regarding the previous Document + current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": overlapping_range}) + + # add split overlap information to previous Document regarding this Document + overlapping_range = (0, overlapping_range[1] - overlapping_range[0]) + previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range}) + + def _number_of_sentences_to_keep( # pylint: disable=too-many-positional-arguments + self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str + ) -> int: + """ + Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`. + + :param sentences: The list of sentences to split. + :param split_length: The maximum number of words in each split. + :param split_overlap: The number of overlapping words in each split. + :returns: The number of sentences to keep in the next chunk. + """ + # If the split_overlap is 0, we don't need to keep any sentences + if split_overlap == 0: + return 0 + + num_sentences_to_keep = 0 + num_words = 0 + + for sent in reversed(sentences[1:]): + if language == "zh" and particle_size == "coarse": + num_words += len(self.chinese_tokenizer_coarse(sent)) + if language == "zh" and particle_size == "fine": + num_words += len(self.chinese_tokenizer_fine(sent)) + # If the number of words is larger than the split_length then don't add any more sentences + if num_words > split_length: + break + num_sentences_to_keep += 1 + if num_words > split_overlap: + break + return num_sentences_to_keep diff --git a/pyproject.toml b/pyproject.toml index 73cd0d2cae..317db06c6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -115,7 +115,8 @@ dependencies = [ "python-oxmsg", # MSGToDocument "nltk>=3.9.1", # NLTKDocumentSplitter, RecursiveDocumentSplitter - "tiktoken", # RecursiveDocumentSplitter + "tiktoken", # RecursiveDocumentSplitter + "hanlp", # ChineseDocumentSplitter # OpenAPI "jsonref", # OpenAPIServiceConnector, OpenAPIServiceToFunctions @@ -290,6 +291,7 @@ warn_unused_configs = true ignore_missing_imports = true check_untyped_defs = true + [[tool.mypy.overrides]] # TODO: Fix component typings module = ["haystack.components.*", "haystack.testing.*"] diff --git a/releasenotes/notes/add-chinese-document-splitter.yaml b/releasenotes/notes/add-chinese-document-splitter.yaml new file mode 100644 index 0000000000..3b8faaec03 --- /dev/null +++ b/releasenotes/notes/add-chinese-document-splitter.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Adding a new component `ChineseDocumentSplitter`, enabling precise splitting of Chinese documents. 
diff --git a/releasenotes/notes/add-current-date-promptbuilder-ff60c846f5a70dc6.yaml b/releasenotes/notes/add-current-date-promptbuilder-ff60c846f5a70dc6.yaml deleted file mode 100644 index 07dacf43a8..0000000000 --- a/releasenotes/notes/add-current-date-promptbuilder-ff60c846f5a70dc6.yaml +++ /dev/null @@ -1,16 +0,0 @@ ---- -enhancements: - - | - Allow the ability to add the current date inside a template in `PromptBuilder` using the following syntax: - - - `{% now 'UTC' %}`: Get the current date for the UTC timezone. - - - `{% now 'America/Chicago' + 'hours=2' %}`: Add two hours to the current date in the Chicago timezone. - - - `{% now 'Europe/Berlin' - 'weeks=2' %}`: Subtract two weeks from the current date in the Berlin timezone. - - - `{% now 'Pacific/Fiji' + 'hours=2', '%H' %}`: Display only the number of hours after adding two hours to the Fiji timezone. - - - `{% now 'Etc/GMT-4', '%I:%M %p' %}`: Change the date format to AM/PM for the GMT-4 timezone. - - Note that if no date format is provided, the default will be `%Y-%m-%d %H:%M:%S`. Please refer to [list of tz database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) for a list of timezones. \ No newline at end of file diff --git a/releasenotes/notes/single-meta-in-azureconverter-ce1cc196a9b161f3.yaml b/releasenotes/notes/single-meta-in-azureconverter-ce1cc196a9b161f3.yaml deleted file mode 100644 index ccc9675002..0000000000 --- a/releasenotes/notes/single-meta-in-azureconverter-ce1cc196a9b161f3.yaml +++ /dev/null @@ -1,5 +0,0 @@ ---- -enhancements: - - | - Adds support for single metadata dictionary input in `AzureOCRDocumentConverter`. In this way, additional metadata can be added to all files processed by this component even when the length of the list of sources is unknown. - diff --git a/test/components/preprocessors/test_chinese_document_splitter.py b/test/components/preprocessors/test_chinese_document_splitter.py new file mode 100644 index 0000000000..4d4bfb8433 --- /dev/null +++ b/test/components/preprocessors/test_chinese_document_splitter.py @@ -0,0 +1,133 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from haystack import Document +from haystack.components.preprocessors.chinese_document_splitter import ChineseDocumentSplitter + + +class TestChineseDocumentSplitter: + @pytest.fixture + def sample_text(self) -> str: + return "这是第一句话,也是故事的开端,紧接着是第二句话,渐渐引出了背景;随后,翻开新/f的一页,我们读到了这一页的第一句话,继续延展出情节的发展,直到这页的第二句话将整段文字温柔地收束于平静之中。" + + def test_split_by_word(self, sample_text): + """ + Test splitting by word. + + Note on Chinese words: + Unlike English where words are usually separated by spaces, + Chinese text is written continuously without spaces between words. + Chinese words can consist of multiple characters. + For example, the English word "America" is translated to "美国" in Chinese, + which consists of two characters but is treated as a single word. + Similarly, "Portugal" is "葡萄牙" in Chinese, + three characters but one word. + Therefore, splitting by word means splitting by these multi-character tokens, + not simply by single characters or spaces. 
+ """ + splitter = ChineseDocumentSplitter( + split_by="word", language="zh", particle_size="coarse", split_length=5, split_overlap=0 + ) + if hasattr(splitter, "warm_up"): + splitter.warm_up() + + result = splitter.run(documents=[Document(content=sample_text)]) + docs = result["documents"] + + assert all(isinstance(doc, Document) for doc in docs) + assert all(len(doc.content.strip()) <= 10 for doc in docs) + + def test_split_by_sentence(self, sample_text): + splitter = ChineseDocumentSplitter( + split_by="sentence", language="zh", particle_size="coarse", split_length=10, split_overlap=0 + ) + if hasattr(splitter, "warm_up"): + splitter.warm_up() + + result = splitter.run(documents=[Document(content=sample_text)]) + docs = result["documents"] + + assert all(isinstance(doc, Document) for doc in docs) + assert all(doc.content.strip() != "" for doc in docs) + assert any("。" in doc.content for doc in docs), "Expected at least one chunk containing a full stop." + + def test_respect_sentence_boundary(self): + """Test that respect_sentence_boundary=True avoids splitting sentences""" + text = "这是第一句话,这是第二句话,这是第三句话。这是第四句话,这是第五句话,这是第六句话!这是第七句话,这是第八句话,这是第九句话?" + doc = Document(content=text) + + splitter = ChineseDocumentSplitter( + split_by="word", split_length=10, split_overlap=3, language="zh", respect_sentence_boundary=True + ) + splitter.warm_up() + result = splitter.run(documents=[doc]) + docs = result["documents"] + + print(f"Total chunks created: {len(docs)}.") + for i, d in enumerate(docs): + print(f"\nChunk {i + 1}:\n{d.content}") + # Optional: check that sentences are not cut off + assert d.content.strip().endswith(("。", "!", "?")), "Sentence was cut off!" + + def test_overlap_chunks_with_long_text(self): + """Test split_overlap parameter to ensure there is clear overlap between chunks of long text""" + text = ( + "月光轻轻洒落,林中传来阵阵狼嚎,夜色悄然笼罩一切。" + "树叶在微风中沙沙作响,影子在地面上摇曳不定。" + "一只猫头鹰静静地眨了眨眼,从枝头注视着四周……" + "远处的小溪哗啦啦地流淌,仿佛在向石头倾诉着什么。" + "“咔嚓”一声,某处的树枝突然断裂,然后恢复了寂静。" + "空气中弥漫着松树与湿土的气息,令人心安。" + "一只狐狸悄然出现,又迅速消失在灌木丛中。" + "天上的星星闪烁着,仿佛在诉说古老的故事。" + "时间仿佛停滞了……" + "万物静候,聆听着夜的呼吸!" + ) + doc = Document(content=text) + + splitter = ChineseDocumentSplitter( + split_by="word", language="zh", split_length=30, split_overlap=10, particle_size="coarse" + ) + if hasattr(splitter, "warm_up"): + splitter.warm_up() + + result = splitter.run(documents=[doc]) + docs = result["documents"] + + print(f"Total chunks generated: {len(docs)}.") + for i, d in enumerate(docs): + print(f"\nChunk {i + 1}:\n{d.content}") + + assert len(docs) > 1, "Expected multiple chunks to be generated" + + max_len_allowed = 80 # Allow a somewhat relaxed max chunk length + assert all(len(doc.content) <= max_len_allowed for doc in docs), ( + f"Some chunks exceed {max_len_allowed} characters" + ) + + def has_any_overlap(suffix: str, prefix: str) -> bool: + """ + Check if suffix and prefix have at least one continuous overlapping character sequence. + Tries from the longest possible overlap down to 1 character. + Returns True if any overlap found. 
+ """ + max_check_len = min(len(suffix), len(prefix)) + for length in range(max_check_len, 0, -1): + if suffix[-length:] == prefix[:length]: + return True + return False + + for i in range(1, len(docs)): + prev_chunk = docs[i - 1].content + curr_chunk = docs[i].content + + # Take last 20 chars of prev chunk and first 20 chars of current chunk to check overlap + overlap_prev = prev_chunk[-20:] + overlap_curr = curr_chunk[:20] + + assert has_any_overlap(overlap_prev, overlap_curr), ( + f"Chunks {i} and {i + 1} do not overlap. " + f"Tail (up to 20 chars): '{overlap_prev}' vs Head (up to 20 chars): '{overlap_curr}'" + )
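+
+    def test_metadata_of_split_documents(self):
+        """
+        Illustrative sanity check (a sketch rather than an exhaustive test): every split document should keep
+        the original document's metadata and gain `source_id`, `split_id`, `split_idx_start` and `page_number`.
+        """
+        doc = Document(content="这是第一句话,也是故事的开端。随后故事慢慢展开,直到结束。", meta={"title": "示例文档"})
+
+        splitter = ChineseDocumentSplitter(
+            split_by="word", language="zh", particle_size="coarse", split_length=5, split_overlap=0
+        )
+        if hasattr(splitter, "warm_up"):
+            splitter.warm_up()
+
+        docs = splitter.run(documents=[doc])["documents"]
+
+        assert len(docs) >= 1
+        assert all(d.meta["source_id"] == doc.id for d in docs)
+        assert all(d.meta["title"] == "示例文档" for d in docs)
+        assert [d.meta["split_id"] for d in docs] == list(range(len(docs)))
+        assert all("split_idx_start" in d.meta and "page_number" in d.meta for d in docs)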