Skip to content

Commit fab5892

Browse files
nihalnihalaniclaude
andcommitted
fix(nodes): use actual text span for SentenceChunker chunk-size accounting
Replace the current_len tracker (which assumed 1-char separators) with position-based span computation that accounts for multi-character whitespace between sentences. After the whitespace-preservation change (text[start_char:end_char] slicing), current_len was underestimating the real chunk width whenever the original separator was longer than one character (e.g. \n\n), causing chunks to silently exceed chunk_size. The overlap budget is now also computed from actual text spans so carried-over sentences fit within the configured overlap window. Add two regression tests: - overlap + repeated sentences: verify spans match text and respect size - multi-char whitespace: verify chunks are split, not all swallowed Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent bf59730 commit fab5892

2 files changed

Lines changed: 56 additions & 25 deletions

File tree

nodes/src/nodes/chunker/chunker_strategies.py

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,6 @@ def chunk(self, text: str) -> list[dict]:
193193
result = []
194194
chunk_index = 0
195195
current_sentences: list[str] = []
196-
current_len = 0
197196
# Track where each sentence starts in the original text to handle repeated sentences
198197
sentence_positions: list[int] = []
199198
search_start = 0
@@ -205,15 +204,25 @@ def chunk(self, text: str) -> list[dict]:
205204
sentence_start_map.append(pos)
206205
search_start = pos + len(sentence)
207206

207+
def _span_len(positions: list[int], sents: list[str]) -> int:
208+
"""Compute the actual text span from first position to end of last sentence."""
209+
if not positions:
210+
return 0
211+
return (positions[-1] + len(sents[-1])) - positions[0]
212+
208213
for sent_idx, sentence in enumerate(sentences):
209-
sentence_len = len(sentence)
214+
next_pos = sentence_start_map[sent_idx]
215+
216+
# Compute what the span would be if we added this sentence
217+
if current_sentences:
218+
candidate_len = (next_pos + len(sentence)) - sentence_positions[0]
219+
else:
220+
candidate_len = len(sentence)
210221

211222
# If adding this sentence exceeds chunk_size and we have content, finalize current
212-
if current_sentences and current_len + (1 if current_len > 0 else 0) + sentence_len > self.chunk_size:
213-
start_char = sentence_positions[0] if sentence_positions else 0
214-
last_sent_pos = sentence_positions[-1] if sentence_positions else start_char
215-
last_sent = current_sentences[-1] if current_sentences else ''
216-
end_char = last_sent_pos + len(last_sent)
223+
if current_sentences and candidate_len > self.chunk_size:
224+
start_char = sentence_positions[0]
225+
end_char = sentence_positions[-1] + len(current_sentences[-1])
217226
chunk_text = text[start_char:end_char]
218227

219228
result.append(
@@ -228,41 +237,32 @@ def chunk(self, text: str) -> list[dict]:
228237
)
229238
chunk_index += 1
230239

231-
# Compute overlap: keep trailing sentences that fit within overlap
240+
# Compute overlap: keep trailing sentences whose actual span fits within overlap
232241
if self.chunk_overlap > 0:
233242
overlap_sentences: list[str] = []
234243
overlap_positions: list[int] = []
235-
overlap_len = 0
236244
for i in range(len(current_sentences) - 1, -1, -1):
237-
s = current_sentences[i]
238-
candidate = len(s) + (1 if overlap_len > 0 else 0) + overlap_len
239-
if candidate <= self.chunk_overlap:
240-
overlap_sentences.insert(0, s)
241-
overlap_positions.insert(0, sentence_positions[i])
242-
overlap_len = candidate
245+
trial_positions = [sentence_positions[i]] + overlap_positions
246+
trial_sentences = [current_sentences[i]] + overlap_sentences
247+
if _span_len(trial_positions, trial_sentences) <= self.chunk_overlap:
248+
overlap_sentences = trial_sentences
249+
overlap_positions = trial_positions
243250
else:
244251
break
245252
current_sentences = overlap_sentences
246253
sentence_positions = overlap_positions
247-
current_len = overlap_len
248254
else:
249255
current_sentences = []
250256
sentence_positions = []
251-
current_len = 0
252257

253258
# Add the sentence
254-
if current_len > 0:
255-
current_len += 1 # space separator
256-
current_len += sentence_len
257259
current_sentences.append(sentence)
258-
sentence_positions.append(sentence_start_map[sent_idx])
260+
sentence_positions.append(next_pos)
259261

260262
# Emit the final chunk
261263
if current_sentences:
262-
start_char = sentence_positions[0] if sentence_positions else 0
263-
last_sent_pos = sentence_positions[-1] if sentence_positions else start_char
264-
last_sent = current_sentences[-1] if current_sentences else ''
265-
end_char = last_sent_pos + len(last_sent)
264+
start_char = sentence_positions[0]
265+
end_char = sentence_positions[-1] + len(current_sentences[-1])
266266
chunk_text = text[start_char:end_char]
267267

268268
result.append(

test/nodes/test_chunker.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,37 @@ def test_repeated_sentences_no_overlap_in_positions(self):
257257
for i in range(1, len(chunks)):
258258
assert chunks[i]['metadata']['start_char'] >= chunks[i - 1]['metadata']['end_char']
259259

260+
def test_overlap_with_repeated_sentences_correct_spans(self):
261+
"""Overlap + repeated sentences: spans must match actual text and respect chunk_size."""
262+
chunker = SentenceChunker(chunk_size=20, chunk_overlap=10)
263+
text = 'Go. Go. Go. Go. Go. Go. Go. Go. Stop.'
264+
chunks = chunker.chunk(text)
265+
assert len(chunks) >= 2
266+
267+
for i, chunk in enumerate(chunks):
268+
meta = chunk['metadata']
269+
# The slice must match the chunk text exactly
270+
assert chunk['text'] == text[meta['start_char'] : meta['end_char']], f'Chunk {i} text mismatch: {chunk["text"]!r} != {text[meta["start_char"] : meta["end_char"]]!r}'
271+
# Actual span must not wildly exceed chunk_size (allow single-sentence overflow)
272+
actual_span = meta['end_char'] - meta['start_char']
273+
max_sentence_len = max(len(s) for s in ['Go.', 'Stop.'])
274+
assert actual_span <= chunker.chunk_size + max_sentence_len, f'Chunk {i} span {actual_span} exceeds chunk_size {chunker.chunk_size} + max_sentence {max_sentence_len}'
275+
276+
def test_overlap_with_multichar_whitespace_respects_chunk_size(self):
277+
"""Multi-char whitespace between sentences must not cause unbounded chunk growth."""
278+
chunker = SentenceChunker(chunk_size=20, chunk_overlap=10)
279+
text = 'A.\n\n\n\nB.\n\n\n\nC.\n\n\n\nD.\n\n\n\nE.'
280+
chunks = chunker.chunk(text)
281+
assert len(chunks) >= 2, f'Expected multiple chunks but got {len(chunks)}'
282+
283+
for i, chunk in enumerate(chunks):
284+
meta = chunk['metadata']
285+
actual_span = meta['end_char'] - meta['start_char']
286+
# With correct span tracking, no single chunk should swallow the entire text
287+
assert actual_span <= len(text), f'Chunk {i} span {actual_span} exceeds text length'
288+
# The slice must match
289+
assert chunk['text'] == text[meta['start_char'] : meta['end_char']]
290+
260291

261292
# ===========================================================================
262293
# TokenChunker tests

0 commit comments

Comments
 (0)