fix(nodes): use actual text span for SentenceChunker chunk-size accounting

nihalnihalani · claude · nihalnihalani · commit fab5892d227f · 2026-04-06T11:06:26.000-07:00
Replace the current_len tracker (which assumed 1-char separators) with
position-based span computation that accounts for multi-character
whitespace between sentences.  After the whitespace-preservation change
(text[start_char:end_char] slicing), current_len was underestimating
the real chunk width whenever the original separator was longer than one
character (e.g. \n\n), causing chunks to silently exceed chunk_size.

The overlap budget is now also computed from actual text spans so
carried-over sentences fit within the configured overlap window.

Add two regression tests:
- overlap + repeated sentences: verify spans match text and respect size
- multi-char whitespace: verify chunks are split, not all swallowed

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/nodes/src/nodes/chunker/chunker_strategies.py b/nodes/src/nodes/chunker/chunker_strategies.py
@@ -193,7 +193,6 @@ def chunk(self, text: str) -> list[dict]:
         result = []
         chunk_index = 0
         current_sentences: list[str] = []
-        current_len = 0
         # Track where each sentence starts in the original text to handle repeated sentences
         sentence_positions: list[int] = []
         search_start = 0
@@ -205,15 +204,25 @@ def chunk(self, text: str) -> list[dict]:
             sentence_start_map.append(pos)
             search_start = pos + len(sentence)
 
+        def _span_len(positions: list[int], sents: list[str]) -> int:
+            """Compute the actual text span from first position to end of last sentence."""
+            if not positions:
+                return 0
+            return (positions[-1] + len(sents[-1])) - positions[0]
+
         for sent_idx, sentence in enumerate(sentences):
-            sentence_len = len(sentence)
+            next_pos = sentence_start_map[sent_idx]
+
+            # Compute what the span would be if we added this sentence
+            if current_sentences:
+                candidate_len = (next_pos + len(sentence)) - sentence_positions[0]
+            else:
+                candidate_len = len(sentence)
 
             # If adding this sentence exceeds chunk_size and we have content, finalize current
-            if current_sentences and current_len + (1 if current_len > 0 else 0) + sentence_len > self.chunk_size:
-                start_char = sentence_positions[0] if sentence_positions else 0
-                last_sent_pos = sentence_positions[-1] if sentence_positions else start_char
-                last_sent = current_sentences[-1] if current_sentences else ''
-                end_char = last_sent_pos + len(last_sent)
+            if current_sentences and candidate_len > self.chunk_size:
+                start_char = sentence_positions[0]
+                end_char = sentence_positions[-1] + len(current_sentences[-1])
                 chunk_text = text[start_char:end_char]
 
                 result.append(
@@ -228,41 +237,32 @@ def chunk(self, text: str) -> list[dict]:
                 )
                 chunk_index += 1
 
-                # Compute overlap: keep trailing sentences that fit within overlap
+                # Compute overlap: keep trailing sentences whose actual span fits within overlap
                 if self.chunk_overlap > 0:
                     overlap_sentences: list[str] = []
                     overlap_positions: list[int] = []
-                    overlap_len = 0
                     for i in range(len(current_sentences) - 1, -1, -1):
-                        s = current_sentences[i]
-                        candidate = len(s) + (1 if overlap_len > 0 else 0) + overlap_len
-                        if candidate <= self.chunk_overlap:
-                            overlap_sentences.insert(0, s)
-                            overlap_positions.insert(0, sentence_positions[i])
-                            overlap_len = candidate
+                        trial_positions = [sentence_positions[i]] + overlap_positions
+                        trial_sentences = [current_sentences[i]] + overlap_sentences
+                        if _span_len(trial_positions, trial_sentences) <= self.chunk_overlap:
+                            overlap_sentences = trial_sentences
+                            overlap_positions = trial_positions
                         else:
                             break
                     current_sentences = overlap_sentences
                     sentence_positions = overlap_positions
-                    current_len = overlap_len
                 else:
                     current_sentences = []
                     sentence_positions = []
-                    current_len = 0
 
             # Add the sentence
-            if current_len > 0:
-                current_len += 1  # space separator
-            current_len += sentence_len
             current_sentences.append(sentence)
-            sentence_positions.append(sentence_start_map[sent_idx])
+            sentence_positions.append(next_pos)
 
         # Emit the final chunk
         if current_sentences:
-            start_char = sentence_positions[0] if sentence_positions else 0
-            last_sent_pos = sentence_positions[-1] if sentence_positions else start_char
-            last_sent = current_sentences[-1] if current_sentences else ''
-            end_char = last_sent_pos + len(last_sent)
+            start_char = sentence_positions[0]
+            end_char = sentence_positions[-1] + len(current_sentences[-1])
             chunk_text = text[start_char:end_char]
 
             result.append(
diff --git a/test/nodes/test_chunker.py b/test/nodes/test_chunker.py
@@ -257,6 +257,37 @@ def test_repeated_sentences_no_overlap_in_positions(self):
             for i in range(1, len(chunks)):
                 assert chunks[i]['metadata']['start_char'] >= chunks[i - 1]['metadata']['end_char']
 
+    def test_overlap_with_repeated_sentences_correct_spans(self):
+        """Overlap + repeated sentences: spans must match actual text and respect chunk_size."""
+        chunker = SentenceChunker(chunk_size=20, chunk_overlap=10)
+        text = 'Go. Go. Go. Go. Go. Go. Go. Go. Stop.'
+        chunks = chunker.chunk(text)
+        assert len(chunks) >= 2
+        max_sentence_len = max(len(s) for s in ['Go.', 'Stop.'])  # loop-invariant: longest fixture sentence
+
+        for i, chunk in enumerate(chunks):
+            meta = chunk['metadata']
+            # The slice must match the chunk text exactly
+            assert chunk['text'] == text[meta['start_char'] : meta['end_char']], f'Chunk {i} text mismatch: {chunk["text"]!r} != {text[meta["start_char"] : meta["end_char"]]!r}'
+            # Actual span must not wildly exceed chunk_size (allow single-sentence overflow)
+            actual_span = meta['end_char'] - meta['start_char']
+            assert actual_span <= chunker.chunk_size + max_sentence_len, f'Chunk {i} span {actual_span} exceeds chunk_size {chunker.chunk_size} + max_sentence {max_sentence_len}'
+
+    def test_overlap_with_multichar_whitespace_respects_chunk_size(self):
+        """Multi-char whitespace between sentences must not cause unbounded chunk growth."""
+        chunker = SentenceChunker(chunk_size=20, chunk_overlap=10)
+        text = 'A.\n\n\n\nB.\n\n\n\nC.\n\n\n\nD.\n\n\n\nE.'
+        chunks = chunker.chunk(text)
+        assert len(chunks) >= 2, f'Expected multiple chunks but got {len(chunks)}'
+
+        for i, chunk in enumerate(chunks):
+            meta = chunk['metadata']
+            actual_span = meta['end_char'] - meta['start_char']
+            # Each sentence is only 2 chars, so no single-sentence overflow: every span must fit chunk_size
+            assert actual_span <= chunker.chunk_size, f'Chunk {i} span {actual_span} exceeds chunk_size {chunker.chunk_size}'
+            # The slice must match
+            assert chunk['text'] == text[meta['start_char'] : meta['end_char']]
+
 
 # ===========================================================================
 # TokenChunker tests