Skip to content

Commit 0bffcfc

Browse files
feat: add splitter classes
1 parent a430183 commit 0bffcfc

10 files changed

Lines changed: 429 additions & 14 deletions

File tree

graphgen/bases/base_splitter.py

Lines changed: 102 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import copy
2+
import re
23
from abc import ABC, abstractmethod
34
from dataclasses import dataclass
4-
from typing import Any, Callable, Dict, List, Optional
5+
from typing import Callable, Iterable, List, Literal, Optional, Union
56

67
from graphgen.bases.datatypes import Chunk
8+
from graphgen.utils import logger
79

810

911
@dataclass
class BaseSplitter(ABC):
    """
    Abstract base class for splitting long text into smaller chunks.
    """

    chunk_size: int = 1024  # target maximum chunk length, per length_function
    chunk_overlap: int = 100  # amount of trailing text carried into the next chunk
    length_function: Callable[[str], int] = len  # how chunk length is measured
    keep_separator: bool = False  # keep separators attached to the splits
    add_start_index: bool = False  # record each chunk's start offset in metadata
    strip_whitespace: bool = True  # strip joined chunks before emitting them

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """
        Split the input text into smaller chunks.

        :param text: The input text to be split.
        :return: A list of text chunks.
        """

    def create_chunks(
        self, texts: List[str], metadatas: Optional[List[dict]] = None
    ) -> List[Chunk]:
        """Create chunks from a list of texts."""
        meta_list = metadatas or [{}] * len(texts)
        result: List[Chunk] = []
        for i, text in enumerate(texts):
            search_from = 0
            prev_len = 0
            for piece in self.split_text(text):
                meta = copy.deepcopy(meta_list[i])
                if self.add_start_index:
                    # Resume the search near the previous hit so repeated
                    # substrings resolve to the correct occurrence.
                    lower_bound = search_from + prev_len - self.chunk_overlap
                    search_from = text.find(piece, max(0, lower_bound))
                    meta["start_index"] = search_from
                prev_len = len(piece)
                result.append(Chunk(content=piece, metadata=meta))
        return result

    def _join_chunks(self, chunks: List[str], separator: str) -> Optional[str]:
        """Join splits with ``separator``; return None when the result is empty."""
        joined = separator.join(chunks)
        if self.strip_whitespace:
            joined = joined.strip()
        return joined if joined != "" else None

    def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
        """Greedily pack small splits into chunks of at most ``chunk_size``,
        keeping roughly ``chunk_overlap`` worth of trailing splits between
        consecutive chunks."""
        sep_len = self.length_function(separator)

        merged: List[str] = []
        window: List[str] = []  # splits accumulated for the current chunk
        running = 0  # length of `window` once joined with `separator`
        for piece in splits:
            piece_len = self.length_function(piece)
            projected = running + piece_len + (sep_len if window else 0)
            if projected > self.chunk_size:
                if running > self.chunk_size:
                    logger.warning(
                        "Created a chunk of size %s, which is longer than the specified %s",
                        running,
                        self.chunk_size,
                    )
                if window:
                    joined = self._join_chunks(window, separator)
                    if joined is not None:
                        merged.append(joined)
                    # Pop from the front while:
                    # - we hold more than the allowed overlap, or
                    # - the next piece still would not fit and the window is non-empty.
                    while running > self.chunk_overlap or (
                        running + piece_len + (sep_len if window else 0)
                        > self.chunk_size
                        and running > 0
                    ):
                        running -= self.length_function(window[0]) + (
                            sep_len if len(window) > 1 else 0
                        )
                        window = window[1:]
            window.append(piece)
            running += piece_len + (sep_len if len(window) > 1 else 0)
        joined = self._join_chunks(window, separator)
        if joined is not None:
            merged.append(joined)
        return merged

    @staticmethod
    def _split_text_with_regex(
        text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
    ) -> List[str]:
        """Split ``text`` on a regex ``separator``; when ``keep_separator`` is
        truthy the delimiter is re-attached to the start (default) or end of
        each piece."""
        if not separator:
            # Empty separator: degrade to one piece per character.
            pieces = list(text)
        elif not keep_separator:
            pieces = re.split(separator, text)
        else:
            # The capturing group keeps the delimiters in the split result.
            raw = re.split(f"({separator})", text)
            if keep_separator == "end":
                pieces = [raw[i] + raw[i + 1] for i in range(0, len(raw) - 1, 2)]
            else:
                pieces = [raw[i] + raw[i + 1] for i in range(1, len(raw), 2)]
            if len(raw) % 2 == 0:
                pieces += raw[-1:]
            if keep_separator == "end":
                pieces = pieces + [raw[-1]]
            else:
                pieces = [raw[0]] + pieces
        return [p for p in pieces if p != ""]

graphgen/models/splitter/__init__.py

Whitespace-only changes.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import re
2+
from typing import Any, List
3+
4+
from graphgen.bases.base_splitter import BaseSplitter
5+
6+
7+
class CharacterSplitter(BaseSplitter):
    """Splitting text that looks at characters."""

    def __init__(
        self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any
    ) -> None:
        """Create a new TextSplitter."""
        super().__init__(**kwargs)
        self._separator = separator
        self._is_separator_regex = is_separator_regex

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        # Naively split the large input on the configured separator, then
        # let the base class merge the pieces back into sized chunks.
        if self._is_separator_regex:
            pattern = self._separator
        else:
            pattern = re.escape(self._separator)
        pieces = self._split_text_with_regex(text, pattern, self.keep_separator)
        join_sep = "" if self.keep_separator else self._separator
        return self._merge_splits(pieces, join_sep)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from typing import Any
2+
3+
from graphgen.models.splitter.recursive_character_splitter import (
4+
RecursiveCharacterSplitter,
5+
)
6+
7+
8+
class MarkdownTextRefSplitter(RecursiveCharacterSplitter):
    """Attempts to split the text along Markdown-formatted headings."""

    def __init__(self, **kwargs: Any) -> None:
        """Initialize a MarkdownTextRefSplitter."""
        # Separators ordered from most to least structural; the recursive
        # splitter tries each one in turn until it matches the text.
        separators = [
            # ATX headings, levels 1-6. The alternative "setext" syntax
            # (a heading underlined with --- or ===) is not handled here.
            "\n#{1,6} ",
            # End of a fenced code block
            "```\n",
            # Horizontal rules made of ***, ---, or ___
            "\n\\*\\*\\*+\n",
            "\n---+\n",
            "\n___+\n",
            # Paragraph break, line break, word break, character fallback
            "\n\n",
            "\n",
            " ",
            "",
        ]
        super().__init__(separators=separators, **kwargs)
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
import re
2+
from typing import Any, List, Optional
3+
4+
from graphgen.bases.base_splitter import BaseSplitter
5+
6+
7+
class RecursiveCharacterSplitter(BaseSplitter):
    """Splitting text by recursively look at characters.

    Recursively tries to split by different characters to find one that works.
    """

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        is_separator_regex: bool = False,
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter."""
        super().__init__(keep_separator=keep_separator, **kwargs)
        self._separators = separators or ["\n\n", "\n", " ", ""]
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split incoming text and return chunks."""
        out: List[str] = []
        # Pick the first separator that occurs in the text; keep the rest
        # for recursive splitting of any piece that is still too large.
        chosen = separators[-1]
        remaining: List[str] = []
        for idx, candidate in enumerate(separators):
            pattern = candidate if self._is_separator_regex else re.escape(candidate)
            if candidate == "":
                chosen = candidate
                break
            if re.search(pattern, text):
                chosen = candidate
                remaining = separators[idx + 1 :]
                break

        pattern = chosen if self._is_separator_regex else re.escape(chosen)
        pieces = self._split_text_with_regex(text, pattern, self.keep_separator)

        # Merge small pieces; recurse into pieces that exceed chunk_size.
        pending: List[str] = []
        join_sep = "" if self.keep_separator else chosen
        for piece in pieces:
            if self.length_function(piece) < self.chunk_size:
                pending.append(piece)
                continue
            if pending:
                out.extend(self._merge_splits(pending, join_sep))
                pending = []
            if remaining:
                out.extend(self._split_text(piece, remaining))
            else:
                # No finer separator left; emit the oversized piece as-is.
                out.append(piece)
        if pending:
            out.extend(self._merge_splits(pending, join_sep))
        return out

    def split_text(self, text: str) -> List[str]:
        return self._split_text(text, self._separators)
67+
68+
69+
class ChineseRecursiveTextSplitter(RecursiveCharacterSplitter):
    """Recursive splitter tuned for Chinese text: splits on sentence-ending
    punctuation and keeps delimiters attached to the end of each piece."""

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        is_separator_regex: bool = True,
        **kwargs: Any,
    ) -> None:
        super().__init__(keep_separator=keep_separator, **kwargs)
        # Fall back from paragraph/line breaks to CJK and ASCII sentence,
        # clause, and comma punctuation.
        self._separators = separators or [
            "\n\n",
            "\n",
            "。|!|?",
            r"\.\s|\!\s|\?\s",
            r";|;\s",
            r",|,\s",
        ]
        self._is_separator_regex = is_separator_regex

    def _split_text_with_regex_from_end(
        self, text: str, separator: str, keep_separator: bool
    ) -> List[str]:
        """Regex split that re-attaches each delimiter to the END of the
        preceding piece (the base-class helper attaches to the start)."""
        if not separator:
            # Empty separator: degrade to one piece per character.
            pieces = list(text)
        elif keep_separator:
            # The capturing group keeps the delimiters in the split result;
            # pair every text part with the delimiter that follows it.
            raw = re.split(f"({separator})", text)
            pieces = ["".join(pair) for pair in zip(raw[0::2], raw[1::2])]
            if len(raw) % 2 == 1:
                # Odd length means a trailing part with no delimiter after it.
                pieces += raw[-1:]
        else:
            pieces = re.split(separator, text)
        return [p for p in pieces if p != ""]

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split incoming text and return cleaned chunks."""
        collected: List[str] = []
        # Pick the first separator that occurs in the text; keep the rest
        # for recursive splitting of any piece that is still too large.
        chosen = separators[-1]
        remaining: List[str] = []
        for idx, candidate in enumerate(separators):
            pattern = candidate if self._is_separator_regex else re.escape(candidate)
            if candidate == "":
                chosen = candidate
                break
            if re.search(pattern, text):
                chosen = candidate
                remaining = separators[idx + 1 :]
                break

        pattern = chosen if self._is_separator_regex else re.escape(chosen)
        pieces = self._split_text_with_regex_from_end(
            text, pattern, self.keep_separator
        )

        # Merge small pieces; recurse into pieces that exceed chunk_size.
        pending: List[str] = []
        join_sep = "" if self.keep_separator else chosen
        for piece in pieces:
            if self.length_function(piece) < self.chunk_size:
                pending.append(piece)
                continue
            if pending:
                collected.extend(self._merge_splits(pending, join_sep))
                pending = []
            if remaining:
                collected.extend(self._split_text(piece, remaining))
            else:
                collected.append(piece)
        if pending:
            collected.extend(self._merge_splits(pending, join_sep))
        # Collapse runs of blank lines and drop whitespace-only chunks.
        return [
            re.sub(r"\n{2,}", "\n", chunk.strip())
            for chunk in collected
            if chunk.strip() != ""
        ]

tests/__init__.py

Whitespace-only changes.

tests/integration_tests/__init__.py

Whitespace-only changes.
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import pytest
2+
3+
from graphgen.models.splitter.character_splitter import CharacterSplitter
4+
5+
6+
@pytest.mark.parametrize(
    "text,chunk_size,chunk_overlap,expected",
    [
        (
            "This is a test.\n\nThis is only a test.\n\nIn the event of an actual emergency...",
            25,
            5,
            [
                "This is a test.",
                "This is only a test.",
                "In the event of an actual emergency...",
            ],
        ),
    ],
)
def test_character_splitter(text, chunk_size, chunk_overlap, expected):
    """CharacterSplitter cuts on the blank-line separator and drops it."""
    splitter = CharacterSplitter(
        separator="\n\n",
        is_separator_regex=False,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        keep_separator=False,
    )
    assert splitter.split_text(text) == expected

0 commit comments

Comments
 (0)