Skip to content

Commit 6ddb70c

Browse files
ndbroadbent and claude committed
feat: chunk long messages at parse time instead of truncating in context
Split messages >280 chars into multiple ParsedMessage objects at parse time, replacing the previous truncation approach in context-window.ts.

Changes:
- Add chunkMessage() helper with word-boundary-aware splitting
- Add createChunkedMessages() shared helper for WhatsApp/iMessage parsers
- Add chunkIndex field to ParsedMessage (undefined if not chunked)
- Add MIN_CHUNK_LENGTH (32) - won't split if remainder too small
- Remove truncateContent() from context-window.ts (no longer needed)
- Add 16 new tests for chunking behavior

Chunk format:
- First chunk: "content…"
- Middle chunks: "…content…"
- Last chunk: "…content"

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 4090b8f commit 6ddb70c

8 files changed

Lines changed: 438 additions & 108 deletions

File tree

src/extraction/context-window.test.ts

Lines changed: 1 addition & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -4,14 +4,7 @@
44

55
import { describe, expect, it } from 'vitest'
66
import type { ParsedMessage } from '../types'
7-
import {
8-
getMessageContext,
9-
isInContextWindow,
10-
MAX_MESSAGE_CHARS,
11-
MIN_CONTEXT_MESSAGES,
12-
TRUNCATION_MARKER,
13-
truncateContent
14-
} from './context-window'
7+
import { getMessageContext, isInContextWindow, MIN_CONTEXT_MESSAGES } from './context-window'
158

169
function createMessage(id: number, content: string, sender = 'User'): ParsedMessage {
1710
return {
@@ -25,24 +18,6 @@ function createMessage(id: number, content: string, sender = 'User'): ParsedMess
2518
}
2619
}
2720

28-
describe('truncateContent', () => {
29-
it('returns short messages unchanged', () => {
30-
const msg = 'Hello world'
31-
expect(truncateContent(msg)).toBe(msg)
32-
})
33-
34-
it('truncates messages over 280 chars', () => {
35-
const longMsg = 'x'.repeat(300)
36-
const result = truncateContent(longMsg)
37-
expect(result).toBe('x'.repeat(MAX_MESSAGE_CHARS) + TRUNCATION_MARKER)
38-
})
39-
40-
it('returns exactly 280 chars unchanged', () => {
41-
const msg = 'x'.repeat(280)
42-
expect(truncateContent(msg)).toBe(msg)
43-
})
44-
})
45-
4621
describe('getMessageContext', () => {
4722
it('throws for invalid index', () => {
4823
const messages = [createMessage(1, 'Hello')]

src/extraction/context-window.ts

Lines changed: 4 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
* Rules:
99
* - Minimum 280 chars before and 280 chars after
1010
* - Minimum 2 messages on each side
11-
* - Each message truncated to max 280 chars with "[truncated to 280 chars]" suffix
11+
* - Messages are already chunked at parse time (≤280 chars each), no truncation needed
1212
* - Snaps to message boundaries
1313
* - Messages include timestamps in WhatsApp format so AI understands time gaps
1414
*/
@@ -17,8 +17,6 @@ import type { CandidateMessage, ContextMessage, ParsedMessage } from '../types'
1717

1818
const MIN_CONTEXT_CHARS = 280
1919
export const MIN_CONTEXT_MESSAGES = 2
20-
export const MAX_MESSAGE_CHARS = 280
21-
export const TRUNCATION_MARKER = ' [truncated to 280 chars]'
2220

2321
export interface MessageContext {
2422
/** Context messages before target */
@@ -34,21 +32,14 @@ export interface MessageContext {
3432
}
3533

3634
/**
37-
* Truncate content to max chars with marker.
38-
*/
39-
export function truncateContent(content: string): string {
40-
if (content.length <= MAX_MESSAGE_CHARS) return content
41-
return content.slice(0, MAX_MESSAGE_CHARS) + TRUNCATION_MARKER
42-
}
43-
44-
/**
45-
* Convert a ParsedMessage to a ContextMessage with truncated content.
35+
* Convert a ParsedMessage to a ContextMessage.
36+
* No truncation needed - messages are already chunked at parse time.
4637
*/
4738
function toContextMessage(msg: ParsedMessage): ContextMessage {
4839
return {
4940
id: msg.id,
5041
sender: msg.sender,
51-
content: truncateContent(msg.content),
42+
content: msg.content,
5243
timestamp: msg.timestamp
5344
}
5445
}

src/extraction/heuristics/context-window.test.ts

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,8 @@
44
* Rules:
55
* - Minimum 280 chars before and 280 chars after
66
* - Minimum 2 messages on each side
7-
* - Each message truncated to max 280 chars with "[truncated to 280 chars]" suffix
8-
* - For prior context: snap to message boundaries, then truncate
7+
* - Messages are chunked at parse time (≤280 chars each), no truncation in context
8+
* - For prior context: snap to message boundaries
99
*/
1010

1111
import { readFileSync } from 'node:fs'
@@ -58,7 +58,7 @@ describe('Context Window', () => {
5858
expect(visitCandidate.contextAfter.length).toBeGreaterThanOrEqual(2)
5959
})
6060

61-
it('should truncate long messages with marker', async () => {
61+
it('should preserve long messages when remainder would be too small to chunk', async () => {
6262
const chat = readFileSync(join(FIXTURES_DIR, 'context-window.txt'), 'utf-8')
6363
const messages = parseWhatsAppChat(chat)
6464
const result = await extractCandidatesByHeuristics(messages)
@@ -71,11 +71,10 @@ describe('Context Window', () => {
7171
...candidate.contextAfter.map((m) => m.content)
7272
].join('\n')
7373

74-
// Long message about The Golden Fork should be truncated
75-
expect(allContext).toContain('[truncated to 280 chars]')
74+
// Long message about The Golden Fork is NOT chunked because remainder < 32 chars
75+
// All content is preserved in a single message
7676
expect(allContext).toContain('Golden Fork')
77-
// Should not contain text beyond 280 chars (the end of the message)
78-
expect(allContext).not.toContain('with friends and family')
77+
expect(allContext).toContain('friends and family')
7978
})
8079

8180
it('should get at least 280 chars of context before', async () => {

src/parser/imessage.ts

Lines changed: 38 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
*/
1717

1818
import type { ParsedMessage } from '../types'
19-
import { normalizeApostrophes } from './index'
19+
import { chunkMessage, createChunkedMessages, normalizeApostrophes } from './index'
2020

2121
// Timestamp line pattern: Apr 02, 2025 8:52:29 AM (optional read receipt)
2222
const TIMESTAMP_PATTERN =
@@ -100,23 +100,26 @@ interface MessageBuilder {
100100
type ParserState = 'timestamp' | 'sender' | 'content'
101101

102102
/**
103-
* Finalize a message builder into a ParsedMessage.
103+
* Finalize a message builder into ParsedMessage(s), chunking long content.
104+
* Returns an array of messages (multiple if content was chunked).
104105
*/
105-
function finalizeBuilder(builder: MessageBuilder, messageId: number): ParsedMessage | null {
106+
function finalizeBuilder(builder: MessageBuilder, startId: number): ParsedMessage[] {
106107
const content = builder.contentLines.join('\n').trim()
107-
if (content.length === 0) return null
108+
if (content.length === 0) return []
108109

109110
const urls = extractUrls(content)
110-
return {
111-
id: messageId,
111+
const rawLine = builder.rawLines.join('\n')
112+
const chunks = chunkMessage(content)
113+
114+
return createChunkedMessages(chunks, {
115+
startId,
112116
timestamp: builder.timestamp,
113117
sender: builder.sender,
114-
content,
115-
rawLine: builder.rawLines.join('\n'),
116-
hasMedia: false,
117-
urls: urls.length > 0 ? urls : undefined,
118-
source: 'imessage'
119-
}
118+
rawLine,
119+
source: 'imessage',
120+
urls,
121+
hasMedia: false
122+
})
120123
}
121124

122125
interface IMessageParserState {
@@ -138,12 +141,12 @@ function createInitialState(): IMessageParserState {
138141
function handleTimestampLine(
139142
timestampMatch: RegExpExecArray,
140143
parserState: IMessageParserState
141-
): ParsedMessage | null {
142-
let message: ParsedMessage | null = null
144+
): ParsedMessage[] {
145+
let messages: ParsedMessage[] = []
143146

144147
if (parserState.currentBuilder && parserState.currentBuilder.contentLines.length > 0) {
145-
message = finalizeBuilder(parserState.currentBuilder, parserState.messageId)
146-
if (message) parserState.messageId++
148+
messages = finalizeBuilder(parserState.currentBuilder, parserState.messageId)
149+
parserState.messageId += messages.length
147150
}
148151

149152
const [, dateStr, timeStr] = timestampMatch
@@ -153,7 +156,7 @@ function handleTimestampLine(
153156
parserState.state = 'sender'
154157
parserState.currentBuilder = null
155158

156-
return message
159+
return messages
157160
}
158161

159162
function handleSenderLine(
@@ -185,9 +188,9 @@ function handleContentLine(
185188
}
186189

187190
/**
188-
* Process a single line and return a message if a complete one was found.
191+
* Process a single line and return messages if a complete one was found.
189192
*/
190-
function processLine(line: string, parserState: IMessageParserState): ParsedMessage | null {
193+
function processLine(line: string, parserState: IMessageParserState): ParsedMessage[] {
191194
const trimmedLine = line.trim()
192195
const timestampMatch = TIMESTAMP_PATTERN.exec(trimmedLine)
193196

@@ -201,17 +204,17 @@ function processLine(line: string, parserState: IMessageParserState): ParsedMess
201204
handleContentLine(line, trimmedLine, parserState)
202205
}
203206

204-
return null
207+
return []
205208
}
206209

207210
/**
208-
* Finalize the parser state and return any remaining message.
211+
* Finalize the parser state and return any remaining messages.
209212
*/
210-
function finalizeParserState(parserState: IMessageParserState): ParsedMessage | null {
213+
function finalizeParserState(parserState: IMessageParserState): ParsedMessage[] {
211214
if (parserState.currentBuilder && parserState.currentBuilder.contentLines.length > 0) {
212215
return finalizeBuilder(parserState.currentBuilder, parserState.messageId)
213216
}
214-
return null
217+
return []
215218
}
216219

217220
/**
@@ -225,12 +228,12 @@ export function parseIMessageChat(raw: string): ParsedMessage[] {
225228
const parserState = createInitialState()
226229

227230
for (const line of lines) {
228-
const msg = processLine(line, parserState)
229-
if (msg) messages.push(msg)
231+
const parsed = processLine(line, parserState)
232+
messages.push(...parsed)
230233
}
231234

232-
const finalMsg = finalizeParserState(parserState)
233-
if (finalMsg) messages.push(finalMsg)
235+
const finalized = finalizeParserState(parserState)
236+
messages.push(...finalized)
234237

235238
return messages
236239
}
@@ -246,10 +249,14 @@ export async function* parseIMessageChatStream(
246249
for await (const rawLine of lines) {
247250
// Normalize apostrophe variants (curly → straight) for regex matching
248251
const line = normalizeApostrophes(rawLine)
249-
const msg = processLine(line, parserState)
250-
if (msg) yield msg
252+
const parsed = processLine(line, parserState)
253+
for (const msg of parsed) {
254+
yield msg
255+
}
251256
}
252257

253-
const finalMsg = finalizeParserState(parserState)
254-
if (finalMsg) yield finalMsg
258+
const finalized = finalizeParserState(parserState)
259+
for (const msg of finalized) {
260+
yield msg
261+
}
255262
}

0 commit comments

Comments
 (0)