Skip to content

Commit ae04768

Browse files
kimjune01 authored and claude committed
fix(core): harden union-find compaction after code review
Five bugs fixed, found via GPT-5.4 code review:

1. cosineSimilarity: handle mismatched vector dimensions (TF-IDF vocab grows over time, making newer vectors longer). Treats missing dims as zero; trailing dims still contribute to the norm.

2. Forest.resolveDirty: replace _dirtyInputs.clear() with per-entry deletion and a reference-equality guard. If union() replaces the inputs array during an async summarization, the stale summary is discarded. Combined inputs resolve in a future call.

3. TFIDFEmbedder.embedQuery: non-mutating embed for retrieval paths. render(query) was contaminating the corpus by calling embed(), which updates the vocabulary and IDF. embedQuery uses the current vocab without modifying state.

4. Forest.union centroid merging: handle mismatched embedding dimensions with Math.max and nullish coalescing.

5. ContextWindow constructor: reject evictAt < graduateAt (which would corrupt _graduatedIndex tracking).

62/62 tests pass (11 new tests for the above).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e538d6d commit ae04768

4 files changed

Lines changed: 259 additions & 6 deletions

File tree

packages/core/src/services/contextWindow.test.ts

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,25 @@ describe('cosineSimilarity', () => {
5252
it('should handle negative values', () => {
5353
expect(cosineSimilarity([1, 0], [-1, 0])).toBeCloseTo(-1.0);
5454
});
55+
56+
it('should handle mismatched dimensions without NaN', () => {
57+
const short = [1, 0];
58+
const long = [1, 0, 0.5, 0.3];
59+
const sim = cosineSimilarity(long, short);
60+
expect(Number.isNaN(sim)).toBe(false);
61+
expect(sim).toBeGreaterThan(0);
62+
// Symmetric
63+
expect(cosineSimilarity(short, long)).toBeCloseTo(sim);
64+
});
65+
66+
it('should return 0 for mismatched zero-overlap vectors', () => {
67+
// short has values only in dims 0-1, long only in dims 2-3
68+
const a = [1, 0];
69+
const b = [0, 0, 1, 0];
70+
const sim = cosineSimilarity(a, b);
71+
expect(Number.isNaN(sim)).toBe(false);
72+
expect(sim).toBeCloseTo(0.0);
73+
});
5574
});
5675

5776
// -- Forest --
@@ -281,6 +300,27 @@ describe('Forest', () => {
281300
expect(roots).toContain(root);
282301
});
283302

303+
it('should handle centroid merging with mismatched embedding dimensions', () => {
304+
const embedder: Embedder = {
305+
embed(text: string): number[] {
306+
// Simulate growing vocab: earlier messages have shorter embeddings
307+
if (text === 'early') return [1, 0];
308+
return [0.5, 0.5, 0.3]; // later messages have longer embeddings
309+
},
310+
};
311+
const forest = new Forest(embedder, stubSummarizer);
312+
forest.insert(0, 'early');
313+
forest.insert(1, 'later');
314+
forest.union(0, 1);
315+
316+
const root = forest.find(0);
317+
const centroid = forest.getCentroid(root);
318+
expect(centroid).toBeDefined();
319+
expect(centroid!.every((v) => !Number.isNaN(v))).toBe(true);
320+
// Merged centroid should have max dimension length
321+
expect(centroid!.length).toBe(3);
322+
});
323+
284324
it('should return no-op for union of same cluster', () => {
285325
const forest = new Forest(stubEmbedder, stubSummarizer);
286326
forest.insert(0, 'a');
@@ -400,6 +440,72 @@ describe('Forest', () => {
400440
expect(members).toContain(1);
401441
});
402442

443+
it('should not drop dirty state added by union during resolveDirty', async () => {
444+
const slowSummarizer: Summarizer = {
445+
async summarize(messages: string[]): Promise<string> {
446+
// Simulate slow LLM call
447+
await new Promise((resolve) => setTimeout(resolve, 10));
448+
return messages.join('; ');
449+
},
450+
};
451+
const forest = new Forest(stubEmbedder, slowSummarizer);
452+
forest.insert(0, 'a');
453+
forest.insert(1, 'b');
454+
forest.union(0, 1); // dirty cluster {0,1}
455+
456+
// Start resolving — the await inside gives us a window
457+
const resolvePromise = forest.resolveDirty();
458+
459+
// While resolve is in flight, add new dirty state
460+
forest.insert(2, 'c');
461+
forest.insert(3, 'd');
462+
forest.union(2, 3); // new dirty cluster {2,3}
463+
464+
await resolvePromise;
465+
466+
// The new dirty cluster should NOT have been wiped
467+
expect(forest.isDirty(forest.find(2))).toBe(true);
468+
469+
// Resolve it now
470+
await forest.resolveDirty();
471+
expect(forest.isDirty(forest.find(2))).toBe(false);
472+
});
473+
474+
it('should not overwrite merged cluster dirty state when in-flight root is merged', async () => {
475+
const slowSummarizer: Summarizer = {
476+
async summarize(messages: string[]): Promise<string> {
477+
await new Promise((resolve) => setTimeout(resolve, 10));
478+
return messages.join('; ');
479+
},
480+
};
481+
const forest = new Forest(stubEmbedder, slowSummarizer);
482+
forest.insert(0, 'a');
483+
forest.insert(1, 'b');
484+
forest.union(0, 1); // dirty cluster {0,1}
485+
const originalRoot = forest.find(0);
486+
487+
// Start resolving {0,1}
488+
const resolvePromise = forest.resolveDirty();
489+
490+
// While {0,1} is being summarized, merge it into a new cluster
491+
forest.insert(2, 'c');
492+
forest.union(originalRoot, 2); // now {0,1,2} is dirty with combined inputs
493+
494+
await resolvePromise;
495+
496+
// The merged cluster should still be dirty — the stale summary
497+
// from the in-flight call should NOT have resolved it
498+
const mergedRoot = forest.find(0);
499+
expect(forest.isDirty(mergedRoot)).toBe(true);
500+
501+
// Resolve it properly now
502+
await forest.resolveDirty();
503+
expect(forest.isDirty(forest.find(0))).toBe(false);
504+
// Summary should include all three messages
505+
const summary = forest.summary(forest.find(0))!;
506+
expect(summary).toBeDefined();
507+
});
508+
403509
it('should list all roots', () => {
404510
const forest = new Forest(stubEmbedder, stubSummarizer);
405511
forest.insert(0, 'a');
@@ -688,6 +794,40 @@ describe('ContextWindow', () => {
688794
expect(cw.totalMessages).toBe(2);
689795
});
690796

797+
it('render(query) should not mutate the embedder corpus', () => {
798+
const embedder = {
799+
embed(text: string): number[] {
800+
if (text.includes('cat')) return [1, 0, 0];
801+
return [0, 0, 1];
802+
},
803+
embedQuery: vi.fn().mockReturnValue([1, 0, 0]),
804+
};
805+
806+
const cw = new ContextWindow(embedder, stubSummarizer, {
807+
graduateAt: 2,
808+
evictAt: 4,
809+
maxColdClusters: 10,
810+
mergeThreshold: 0.0,
811+
});
812+
813+
cw.append('cat info');
814+
cw.append('dog info');
815+
cw.append('hot1');
816+
817+
// render with query should call embedQuery, not embed
818+
cw.render('cat question', 1, 0.0);
819+
expect(embedder.embedQuery).toHaveBeenCalledWith('cat question');
820+
});
821+
822+
it('should throw if evictAt < graduateAt', () => {
823+
expect(() => {
824+
new ContextWindow(stubEmbedder, stubSummarizer, {
825+
graduateAt: 5,
826+
evictAt: 3,
827+
});
828+
}).toThrow('evictAt (3) must be >= graduateAt (5)');
829+
});
830+
691831
it('should expose forest for direct access', () => {
692832
const cw = new ContextWindow(stubEmbedder, stubSummarizer);
693833
expect(cw.forest).toBeInstanceOf(Forest);

packages/core/src/services/contextWindow.ts

Lines changed: 53 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
/**
88
* Union-find context compaction with overlap window and deferred summarization.
99
*
10+
* Reading order for reviewers:
11+
* 1. cosineSimilarity() — handles mismatched vector dimensions safely
12+
* 2. Forest class — union-find with path compression, deferred summarization
13+
* 3. ContextWindow class — overlap window, graduation/eviction
14+
* 4. Integration: chatCompressionService.ts compactWithUnionFind()
15+
*
1016
* v2 architecture:
1117
* - append() is synchronous — no LLM calls. Graduation triggers structural
1218
* union() only.
@@ -15,13 +21,17 @@
1521
* in background during main LLM call wait.
1622
* - Overlap window (graduateAt/evictAt): graduated messages stay in hot zone
1723
* for ~2 turns. By the time they evict, background resolveDirty() has
18-
* resolved their cluster summaries. Zero blocking, zero staleness.
24+
* resolved their cluster summaries.
25+
*
26+
* Design doc: https://github.com/kimjune01/union-find-compaction-for-gemini-cli/blob/main/transformation-design.md
1927
*/
2028

2129
// -- Interfaces --
2230

2331
export interface Embedder {
2432
embed(text: string): number[];
33+
/** Embed without mutating internal state. Used for queries/retrieval. */
34+
embedQuery?(text: string): number[];
2535
}
2636

2737
export interface Summarizer {
@@ -41,15 +51,27 @@ export interface Message {
4151

4252
// -- Helpers --
4353

54+
// TF-IDF vocabulary grows over time, so newer vectors are longer than older
55+
// ones. We handle mismatched dimensions by treating missing entries as zero:
56+
// only shared dimensions contribute to the dot product, but trailing dimensions
57+
// still contribute to the norm (lowering similarity, as expected).
4458
export function cosineSimilarity(a: number[], b: number[]): number {
59+
const len = Math.min(a.length, b.length);
4560
let dot = 0;
4661
let normA = 0;
4762
let normB = 0;
48-
for (let i = 0; i < a.length; i++) {
63+
for (let i = 0; i < len; i++) {
4964
dot += a[i] * b[i];
5065
normA += a[i] * a[i];
5166
normB += b[i] * b[i];
5267
}
68+
// Include trailing dimensions from the longer vector in its norm
69+
for (let i = len; i < a.length; i++) {
70+
normA += a[i] * a[i];
71+
}
72+
for (let i = len; i < b.length; i++) {
73+
normB += b[i] * b[i];
74+
}
5375
normA = Math.sqrt(normA);
5476
normB = Math.sqrt(normB);
5577
if (normA === 0 || normB === 0) return 0.0;
@@ -167,7 +189,12 @@ export class Forest {
167189
if (ca && cb) {
168190
const na = membersA.length - membersB.length;
169191
const nb = membersB.length;
170-
const merged = ca.map((v, i) => (v * na + cb[i] * nb) / (na + nb));
192+
const total = na + nb;
193+
const maxLen = Math.max(ca.length, cb.length);
194+
const merged = new Array<number>(maxLen);
195+
for (let i = 0; i < maxLen; i++) {
196+
merged[i] = ((ca[i] ?? 0) * na + (cb[i] ?? 0) * nb) / total;
197+
}
171198
this._centroids.set(rootA, merged);
172199
}
173200

@@ -200,14 +227,26 @@ export class Forest {
200227
/**
201228
* Batch-summarize all dirty clusters. One LLM call per dirty root.
202229
* Called as fire-and-forget after render(), runs during main LLM call wait.
230+
*
231+
* Concurrency safety: union() can run between awaits (JS is single-threaded
232+
* but yields at each await). When union() merges into a dirty root, it
233+
* replaces _dirtyInputs with a new array containing combined content.
234+
* We detect this via reference equality (=== check on the inputs array).
235+
* If the array changed, we skip — the combined entry resolves next call.
203236
*/
204237
async resolveDirty(): Promise<void> {
205238
const entries = [...this._dirtyInputs.entries()];
206239
for (const [root, inputs] of entries) {
240+
if (!this._dirtyInputs.has(root)) continue;
207241
const summary = await this._summarizer.summarize(inputs);
208-
this._summaries.set(root, summary);
242+
if (this._dirtyInputs.get(root) === inputs) {
243+
this._summaries.set(root, summary);
244+
this._dirtyInputs.delete(root);
245+
}
246+
// If union() replaced the inputs (merged new content into this root)
247+
// or merged this root away, skip — the combined dirty entry will be
248+
// resolved in a future resolveDirty() call.
209249
}
210-
this._dirtyInputs.clear();
211250
}
212251

213252
/** Whether this cluster has unsummarized content. */
@@ -329,6 +368,11 @@ export class ContextWindow {
329368
this._evictAt = options.evictAt ?? 30;
330369
this._maxColdClusters = options.maxColdClusters ?? 10;
331370
this._mergeThreshold = options.mergeThreshold ?? 0.15;
371+
if (this._evictAt < this._graduateAt) {
372+
throw new Error(
373+
`evictAt (${this._evictAt}) must be >= graduateAt (${this._graduateAt})`,
374+
);
375+
}
332376
}
333377

334378
/**
@@ -420,7 +464,10 @@ export class ContextWindow {
420464
let cold: string[];
421465

422466
if (query != null && this._forest.clusterCount() > 0) {
423-
const queryEmb = this._embedder.embed(query);
467+
// Use embedQuery (non-mutating) to avoid contaminating the TF-IDF corpus.
468+
// embed() would add query terms to the vocabulary, changing future embeddings.
469+
const embedFn = this._embedder.embedQuery ?? this._embedder.embed;
470+
const queryEmb = embedFn.call(this._embedder, query);
424471
const topRoots = this._forest.nearest(queryEmb, k, minSim);
425472
cold = topRoots.map((r) => this._forest.compact(r));
426473
} else {

packages/core/src/services/embeddingService.test.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,32 @@ describe('TFIDFEmbedder', () => {
8686
}
8787
});
8888

89+
it('embedQuery should not mutate vocabulary or doc count', () => {
90+
const embedder = new TFIDFEmbedder();
91+
embedder.embed('alpha beta');
92+
embedder.embed('gamma delta');
93+
const vocabBefore = embedder.getVocabulary().length;
94+
95+
// embedQuery with new terms should not grow vocab
96+
const qvec = embedder.embedQuery('epsilon zeta');
97+
const vocabAfter = embedder.getVocabulary().length;
98+
99+
expect(vocabAfter).toBe(vocabBefore);
100+
// Unknown terms should produce a zero vector (no known terms matched)
101+
expect(qvec.every((v) => v === 0)).toBe(true);
102+
});
103+
104+
it('embedQuery should use existing vocabulary for known terms', () => {
105+
const embedder = new TFIDFEmbedder();
106+
embedder.embed('cat dog fish');
107+
108+
const qvec = embedder.embedQuery('cat');
109+
// Should produce non-zero vector since 'cat' is in vocab
110+
expect(qvec.some((v) => v !== 0)).toBe(true);
111+
// Same dimension as current vocab
112+
expect(qvec.length).toBe(embedder.getVocabulary().length);
113+
});
114+
89115
it('should normalize vectors', () => {
90116
const embedder = new TFIDFEmbedder();
91117
const vec = embedder.embed('test normalization vector');

packages/core/src/services/embeddingService.ts

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,46 @@ export class TFIDFEmbedder implements Embedder {
6868
return vec;
6969
}
7070

71+
/**
72+
* Embed without mutating vocabulary, docCount, or termDocFreq.
73+
* Used for queries/retrieval so searching doesn't contaminate the corpus.
74+
*/
75+
embedQuery(text: string): number[] {
76+
const tokens = this._tokenize(text);
77+
if (tokens.length === 0) {
78+
return new Array<number>(Math.max(this._vocab.size, 1)).fill(0);
79+
}
80+
81+
const vec = new Array<number>(this._vocab.size).fill(0);
82+
const termFreq = new Map<string, number>();
83+
84+
for (const token of tokens) {
85+
termFreq.set(token, (termFreq.get(token) ?? 0) + 1);
86+
}
87+
88+
for (const [term, count] of termFreq) {
89+
const idx = this._vocab.get(term);
90+
if (idx === undefined) continue; // unknown terms ignored
91+
92+
const tf = count / tokens.length;
93+
const df = this._termDocFreq.get(term) ?? 1;
94+
const idf = Math.log(1 + this._docCount / df);
95+
vec[idx] = tf * idf;
96+
}
97+
98+
// L2 normalize
99+
const norm = Math.sqrt(
100+
vec.reduce((sum: number, v: number) => sum + v * v, 0),
101+
);
102+
if (norm > 0) {
103+
for (let i = 0; i < vec.length; i++) {
104+
vec[i] /= norm;
105+
}
106+
}
107+
108+
return vec;
109+
}
110+
71111
getVocabulary(): string[] {
72112
const vocab = new Array<string>(this._vocab.size);
73113
for (const [term, idx] of this._vocab) {

0 commit comments

Comments (0)