-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy pathembedding-strategy.test.ts
More file actions
531 lines (438 loc) · 18.9 KB
/
Copy pathembedding-strategy.test.ts
File metadata and controls
531 lines (438 loc) · 18.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import Database from 'better-sqlite3';
import { afterAll, beforeAll, describe, expect, test, vi } from 'vitest';
import { initSchema } from '../../src/db/index.js';
// ─── Mock setup ────────────────────────────────────────────────────────
// Capture buffer for every text handed to the mocked embedding pipeline, in
// call order. It is created through vi.hoisted so the vi.mock factory below can
// reference it. The element type is declared explicitly: an untyped empty
// array literal inside the returned object would otherwise be inferred as
// never[] under strict type checking, which rejects push() and string methods.
const { EMBEDDED_TEXTS } = vi.hoisted(() => {
  const capturedTexts: string[] = [];
  return { EMBEDDED_TEXTS: capturedTexts };
});
// Stand-in for '@huggingface/transformers': `pipeline` resolves to an async
// extractor that records each input text in EMBEDDED_TEXTS and returns a flat
// Float32Array holding one 384-wide vector per input (only the first two
// components are non-zero). `cos_sim` always reports 0.
vi.mock('@huggingface/transformers', () => {
  const VECTOR_WIDTH = 384;

  const embedBatch = (batch) => {
    const data = new Float32Array(VECTOR_WIDTH * batch.length);
    for (let idx = 0; idx < batch.length; idx += 1) {
      const offset = idx * VECTOR_WIDTH;
      EMBEDDED_TEXTS.push(batch[idx]);
      data[offset] = 0.5;
      data[offset + 1] = 0.3;
    }
    return { data };
  };

  return {
    pipeline: async () => async (batch) => embedBatch(batch),
    cos_sim: () => 0,
  };
});
import {
buildEmbeddings,
EMBEDDING_STRATEGIES,
estimateTokens,
MODELS,
} from '../../src/domain/search/index.js';
// ─── Helpers ───────────────────────────────────────────────────────────
/**
 * Inserts one row into the `nodes` table.
 *
 * @param db - Database handle exposing `prepare(sql).run(...)`.
 * @param name - Symbol name.
 * @param kind - Symbol kind (e.g. 'function', 'class', 'method').
 * @param file - File path stored for the symbol.
 * @param line - Start line of the symbol.
 * @param endLine - End line of the symbol.
 * @returns The `lastInsertRowid` reported by the insert.
 */
function insertNode(db, name, kind, file, line, endLine) {
  const statement = db.prepare(
    'INSERT INTO nodes (name, kind, file, line, end_line) VALUES (?, ?, ?, ?, ?)',
  );
  const outcome = statement.run(name, kind, file, line, endLine);
  return outcome.lastInsertRowid;
}
/**
 * Inserts one row into the `edges` table linking two nodes.
 *
 * @param db - Database handle exposing `prepare(sql).run(...)`.
 * @param sourceId - Id of the node the edge starts from.
 * @param targetId - Id of the node the edge points to.
 * @param kind - Edge kind (the fixtures use 'calls').
 */
function insertEdge(db, sourceId, targetId, kind) {
  const statement = db.prepare('INSERT INTO edges (source_id, target_id, kind) VALUES (?, ?, ?)');
  statement.run(sourceId, targetId, kind);
}
// ─── Fixture ───────────────────────────────────────────────────────────
// On-disk sources for the fixture project, keyed by file name. The line
// positions inside each file line up with the line numbers given to the nodes
// inserted in beforeAll (e.g. `add` sits on line 4 of math.js).
const FIXTURE_FILES = {
  'math.js':
    '/**\n' +
    ' * Add two numbers together.\n' +
    ' */\n' +
    'export function add(a, b) { return a + b; }\n' +
    'export function multiply(a, b) { return a * b; }\n' +
    'export function square(x) { return multiply(x, x); }',
  'utils.js':
    "import { add, square } from './math.js';\n" +
    'export function sumOfSquares(a, b) { return add(square(a), square(b)); }\n' +
    'export class Calculator {\n' +
    ' compute(x, y) { return sumOfSquares(x, y); }\n' +
    '}',
};
// Scratch project root and the graph database created inside it.
let tmpDir: string;
let dbPath: string;

// Shared fixture: writes FIXTURE_FILES into a fresh temp directory, then
// creates `.codegraph/graph.db` with one node per symbol and the call edges
// between them.
beforeAll(() => {
  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-strategy-test-'));

  // Materialise the fixture sources on disk.
  Object.entries(FIXTURE_FILES).forEach(([fileName, source]) => {
    fs.writeFileSync(path.join(tmpDir, fileName), source);
  });

  // Create the graph database.
  const graphDir = path.join(tmpDir, '.codegraph');
  fs.mkdirSync(graphDir, { recursive: true });
  dbPath = path.join(graphDir, 'graph.db');

  const graph = new Database(dbPath);
  graph.pragma('journal_mode = WAL');
  initSchema(graph);

  // math.js symbols (line numbers are 1-indexed).
  const addId = insertNode(graph, 'add', 'function', 'math.js', 4, 4);
  const multiplyId = insertNode(graph, 'multiply', 'function', 'math.js', 5, 5);
  const squareId = insertNode(graph, 'square', 'function', 'math.js', 6, 6);

  // utils.js symbols.
  const sumOfSquaresId = insertNode(graph, 'sumOfSquares', 'function', 'utils.js', 2, 2);
  insertNode(graph, 'Calculator', 'class', 'utils.js', 3, 5);
  const computeId = insertNode(graph, 'compute', 'method', 'utils.js', 4, 4);

  // [caller, callee] pairs, inserted in this order.
  const callEdges = [
    [squareId, multiplyId],
    [sumOfSquaresId, addId],
    [sumOfSquaresId, squareId],
    [computeId, sumOfSquaresId],
  ];
  for (const [callerId, calleeId] of callEdges) {
    insertEdge(graph, callerId, calleeId, 'calls');
  }

  graph.close();
});
// Remove the shared scratch directory (skipped if setup never created it).
afterAll(() => {
  if (!tmpDir) return;
  fs.rmSync(tmpDir, { recursive: true, force: true });
});
// ─── Tests ─────────────────────────────────────────────────────────────
// The exported strategy list must offer both strategies exercised in this file.
describe('EMBEDDING_STRATEGIES', () => {
  test('exports valid strategies', () => {
    for (const strategy of ['structured', 'source']) {
      expect(EMBEDDING_STRATEGIES).toContain(strategy);
    }
  });
});
// estimateTokens is expected to behave like ceil(length / 4).
describe('estimateTokens', () => {
  test('estimates ~4 chars per token', () => {
    const exactMultiples: Array<[string, number]> = [
      ['abcd', 1],
      ['abcdefgh', 2],
      ['a'.repeat(100), 25],
    ];
    for (const [text, expected] of exactMultiples) {
      expect(estimateTokens(text)).toBe(expected);
    }
  });

  test('rounds up', () => {
    // Five characters spill into a second token.
    const tokens = estimateTokens('abcde');
    expect(tokens).toBe(2);
  });

  test('handles empty string', () => {
    const tokens = estimateTokens('');
    expect(tokens).toBe(0);
  });
});
// Sanity checks on the model registry.
describe('MODELS contextWindow', () => {
  test('every model has a contextWindow', () => {
    Object.entries(MODELS).forEach(([key, config]) => {
      // The custom message names the offending model on failure.
      expect(config.contextWindow, `${key} missing contextWindow`).toBeGreaterThan(0);
    });
  });

  test('jina-code points to the published code embedding model', () => {
    const { name } = MODELS['jina-code'];
    expect(name).toBe('jinaai/jina-embeddings-v2-base-code');
  });
});
// Structured strategy: the embedded text for each symbol should carry graph
// context (callers/callees), leading comments and parameters.
describe('buildEmbeddings with structured strategy', () => {
  // Build once for the whole suite. Previously the build lived inside the first
  // test and the remaining tests silently relied on its side effects, so
  // running any of them in isolation (e.g. with `-t`) failed.
  beforeAll(async () => {
    EMBEDDED_TEXTS.length = 0;
    await buildEmbeddings(tmpDir, 'minilm', dbPath, { strategy: 'structured' });
  });

  test('produces embeddings with graph context', () => {
    expect(EMBEDDED_TEXTS.length).toBeGreaterThan(0);

    // square calls multiply → should appear in structured text
    const squareText = EMBEDDED_TEXTS.find((t) => t.startsWith('function square'));
    expect(squareText).toBeDefined();
    expect(squareText).toContain('Calls:');
    expect(squareText).toContain('multiply');

    // sumOfSquares calls add and square → should appear
    const sosText = EMBEDDED_TEXTS.find((t) => t.startsWith('function sumOfSquares'));
    expect(sosText).toBeDefined();
    expect(sosText).toContain('Calls:');
    expect(sosText).toContain('add');
    expect(sosText).toContain('square');

    // sumOfSquares is called by compute → should appear
    expect(sosText).toContain('Called by:');
    expect(sosText).toContain('compute');
  });

  test('extracts leading comments', () => {
    // add has a JSDoc comment above it: "Add two numbers together."
    const addText = EMBEDDED_TEXTS.find((t) => t.startsWith('function add'));
    expect(addText).toBeDefined();
    expect(addText).toContain('Add two numbers together');
  });

  test('extracts parameters from signature', () => {
    const addText = EMBEDDED_TEXTS.find((t) => t.startsWith('function add'));
    expect(addText).toBeDefined();
    expect(addText).toContain('Parameters:');
    expect(addText).toContain('a, b');
  });

  test('stores strategy in metadata', () => {
    const db = new Database(dbPath, { readonly: true });
    const row = db.prepare("SELECT value FROM embedding_meta WHERE key = 'strategy'").get();
    db.close();
    expect(row.value).toBe('structured');
  });

  test('structured texts are compact', () => {
    // Guard against a vacuous pass: the loop below asserts nothing when the
    // capture buffer is empty.
    expect(EMBEDDED_TEXTS.length).toBeGreaterThan(0);
    for (const text of EMBEDDED_TEXTS) {
      const tokens = estimateTokens(text);
      expect(tokens).toBeLessThan(200);
    }
  });
});
// Source strategy: the embedded text is the raw source of each symbol, with no
// graph-context lines.
describe('buildEmbeddings with source strategy', () => {
  // Build once for the whole suite so the metadata test no longer depends on
  // the first test having run before it.
  beforeAll(async () => {
    EMBEDDED_TEXTS.length = 0;
    await buildEmbeddings(tmpDir, 'minilm', dbPath, { strategy: 'source' });
  });

  test('produces embeddings with raw source code', () => {
    expect(EMBEDDED_TEXTS.length).toBeGreaterThan(0);

    // Source strategy should NOT have graph context lines
    const squareText = EMBEDDED_TEXTS.find((t) => t.startsWith('function square'));
    expect(squareText).toBeDefined();
    expect(squareText).not.toContain('Calls:');
    expect(squareText).not.toContain('Called by:');
    expect(squareText).toContain('return');
  });

  test('stores strategy in metadata', () => {
    const db = new Database(dbPath, { readonly: true });
    const row = db.prepare("SELECT value FROM embedding_meta WHERE key = 'strategy'").get();
    db.close();
    expect(row.value).toBe('source');
  });
});
// Omitting the options argument must record the 'structured' strategy.
describe('buildEmbeddings defaults to structured', () => {
  test('no options → structured strategy', async () => {
    EMBEDDED_TEXTS.length = 0;
    await buildEmbeddings(tmpDir, 'minilm', dbPath);

    const reader = new Database(dbPath, { readonly: true });
    const strategyQuery = reader.prepare(
      "SELECT value FROM embedding_meta WHERE key = 'strategy'",
    );
    const strategyRow = strategyQuery.get();
    reader.close();

    expect(strategyRow.value).toBe('structured');
  });
});
// Building embeddings should also populate the full-text index (`fts_index`)
// and the `full_text` column of the `embeddings` table.
describe('FTS5 index built alongside embeddings', () => {
  // Build once for the whole suite. Previously only the first test ran the
  // build and the others depended on it having executed first.
  beforeAll(async () => {
    EMBEDDED_TEXTS.length = 0;
    await buildEmbeddings(tmpDir, 'minilm', dbPath, { strategy: 'structured' });
  });

  test('full_text column is populated in embeddings table', () => {
    const db = new Database(dbPath, { readonly: true });
    const rows = db.prepare('SELECT full_text FROM embeddings WHERE full_text IS NOT NULL').all();
    db.close();
    expect(rows.length).toBeGreaterThan(0);
    // Each full_text should contain structured text content
    for (const row of rows) {
      expect(row.full_text.length).toBeGreaterThan(0);
    }
  });

  test('FTS5 row count matches embedding count', () => {
    const db = new Database(dbPath, { readonly: true });
    const embCount = db.prepare('SELECT COUNT(*) as c FROM embeddings').get().c;
    const ftsCount = db.prepare('SELECT COUNT(*) as c FROM fts_index').get().c;
    db.close();
    expect(ftsCount).toBe(embCount);
  });

  test('FTS5 content matches the structured/source text', () => {
    const db = new Database(dbPath, { readonly: true });
    let pairs;
    try {
      // FTS5 rowid matches embeddings.node_id
      const emb = db.prepare('SELECT node_id, full_text FROM embeddings').all();
      // Prepare the lookup once instead of once per row.
      const lookup = db.prepare('SELECT content FROM fts_index WHERE rowid = ?');
      pairs = emb.map((row) => ({ row, fts: lookup.get(row.node_id) }));
    } finally {
      // Close before asserting so a failed expectation cannot leak the handle
      // (an open handle can block removal of the temp directory in afterAll).
      db.close();
    }
    for (const { row, fts } of pairs) {
      expect(fts).toBeDefined();
      expect(fts.content).toBe(row.full_text);
    }
  });

  test('fts_count is stored in metadata', () => {
    const db = new Database(dbPath, { readonly: true });
    const row = db.prepare("SELECT value FROM embedding_meta WHERE key = 'fts_count'").get();
    db.close();
    expect(row).toBeDefined();
    expect(Number(row.value)).toBeGreaterThan(0);
  });

  test('FTS5 name column contains symbol names', () => {
    const db = new Database(dbPath, { readonly: true });
    const results = db
      .prepare("SELECT rowid, name FROM fts_index WHERE fts_index MATCH 'add'")
      .all();
    db.close();
    expect(results.length).toBeGreaterThan(0);
    const names = results.map((r) => r.name);
    expect(names).toContain('add');
  });
});
// Regression suite for #752: nodes whose `file` column holds an absolute path
// must still be embedded under both strategies.
describe('absolute file paths in DB (#752)', () => {
  let absDir: string;
  let absDbPath: string;

  beforeAll(() => {
    absDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-abspath-test-'));

    // Single-function source file; its absolute path is what goes into the DB.
    const absFile = path.join(absDir, 'math.js');
    fs.writeFileSync(absFile, 'export function add(a, b) { return a + b; }\n');

    const graphDir = path.join(absDir, '.codegraph');
    fs.mkdirSync(graphDir, { recursive: true });
    absDbPath = path.join(graphDir, 'graph.db');

    const graph = new Database(absDbPath);
    graph.pragma('journal_mode = WAL');
    initSchema(graph);
    // Insert node with an absolute file path (as the native engine does)
    insertNode(graph, 'add', 'function', absFile, 1, 1);
    graph.close();
  });

  afterAll(() => {
    if (!absDir) return;
    fs.rmSync(absDir, { recursive: true, force: true });
  });

  test('produces embeddings when DB stores absolute paths (structured)', async () => {
    EMBEDDED_TEXTS.length = 0;
    await buildEmbeddings(absDir, 'minilm', absDbPath);
    expect(EMBEDDED_TEXTS).toHaveLength(1);

    const reader = new Database(absDbPath, { readonly: true });
    const { c: stored } = reader.prepare('SELECT COUNT(*) as c FROM embeddings').get();
    reader.close();
    expect(stored).toBe(1);
  });

  test('produces embeddings when DB stores absolute paths (source)', async () => {
    EMBEDDED_TEXTS.length = 0;
    await buildEmbeddings(absDir, 'minilm', absDbPath, { strategy: 'source' });
    expect(EMBEDDED_TEXTS).toHaveLength(1);
    const [onlyText] = EMBEDDED_TEXTS;
    expect(onlyText).toContain('add');
  });
});
// Regression suite for #983: buildEmbeddings is called with a directory
// argument that does NOT contain the sources, while the database lives inside
// the real repo. The tests check that the sources are still found, and that a
// build with no readable sources rejects instead of succeeding.
// NOTE: the tests change the process-wide cwd; it is only restored in afterAll.
describe('embed resolves source files from DB root, not cwd (#983)', () => {
let repoDir: string, otherDir: string, repoDbPath: string;
let originalCwd: string;
beforeAll(() => {
// Repo that was built (files live here)
repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-repo-'));
// Unrelated directory we'll cd into when running embed
otherDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-other-'));
// Two one-line functions: alpha on line 1, beta on line 2.
fs.writeFileSync(
path.join(repoDir, 'a.js'),
'export function alpha() { return 1; }\nexport function beta() { return alpha(); }\n',
);
const dbDir = path.join(repoDir, '.codegraph');
fs.mkdirSync(dbDir, { recursive: true });
repoDbPath = path.join(dbDir, 'graph.db');
const db = new Database(repoDbPath);
db.pragma('journal_mode = WAL');
initSchema(db);
// DB stores *relative* file paths (typical of WASM-engine builds)
insertNode(db, 'alpha', 'function', 'a.js', 1, 1);
insertNode(db, 'beta', 'function', 'a.js', 2, 2);
// Persist the repo root as the build pipeline would
db.prepare('INSERT OR REPLACE INTO build_meta (key, value) VALUES (?, ?)').run(
'root_dir',
path.resolve(repoDir),
);
db.close();
// Remember where we started so afterAll can undo the chdir calls below.
originalCwd = process.cwd();
});
afterAll(() => {
// Restoring cwd is best-effort: a failure is swallowed so the directory
// cleanup below still runs.
try {
process.chdir(originalCwd);
} catch {
/* ignore */
}
if (repoDir) fs.rmSync(repoDir, { recursive: true, force: true });
if (otherDir) fs.rmSync(otherDir, { recursive: true, force: true });
});
// Case 1: build_meta.root_dir is present and points at the repo.
test('uses root_dir metadata when embed is invoked from unrelated cwd', async () => {
EMBEDDED_TEXTS.length = 0;
process.chdir(otherDir);
// Simulate the CLI: positional dir defaults to cwd (here: otherDir), DB path is absolute
await buildEmbeddings(process.cwd(), 'minilm', repoDbPath);
// One text per node (alpha and beta).
expect(EMBEDDED_TEXTS.length).toBe(2);
const db = new Database(repoDbPath, { readonly: true });
const count = db.prepare('SELECT COUNT(*) as c FROM embeddings').get().c;
db.close();
expect(count).toBe(2);
});
// Case 2: no root_dir row, so the root has to be derived from the DB location.
test('falls back to <dbPath>/../.. when root_dir metadata is missing', async () => {
// Build a fresh DB without root_dir metadata
const legacyRepo = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-legacy-'));
try {
fs.writeFileSync(path.join(legacyRepo, 'b.js'), 'export function gamma() { return 42; }\n');
const legacyDbDir = path.join(legacyRepo, '.codegraph');
fs.mkdirSync(legacyDbDir, { recursive: true });
const legacyDb = path.join(legacyDbDir, 'graph.db');
const db = new Database(legacyDb);
db.pragma('journal_mode = WAL');
initSchema(db);
insertNode(db, 'gamma', 'function', 'b.js', 1, 1);
// Deliberately NOT writing root_dir — simulates DB built before #983 fix
db.close();
EMBEDDED_TEXTS.length = 0;
process.chdir(otherDir);
await buildEmbeddings(process.cwd(), 'minilm', legacyDb);
expect(EMBEDDED_TEXTS.length).toBe(1);
} finally {
// This test owns legacyRepo, so it removes it itself rather than in afterAll.
fs.rmSync(legacyRepo, { recursive: true, force: true });
}
});
// Case 3: the only node references a file that was never written to disk.
test('exits non-zero (throws) when no source files can be read', async () => {
// Build a DB pointing at files that no longer exist
const ghostRepo = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-embed983-ghost-'));
try {
const ghostDbDir = path.join(ghostRepo, '.codegraph');
fs.mkdirSync(ghostDbDir, { recursive: true });
const ghostDb = path.join(ghostDbDir, 'graph.db');
const db = new Database(ghostDb);
db.pragma('journal_mode = WAL');
initSchema(db);
insertNode(db, 'missing', 'function', 'does-not-exist.js', 1, 1);
db.prepare('INSERT OR REPLACE INTO build_meta (key, value) VALUES (?, ?)').run(
'root_dir',
path.resolve(ghostRepo),
);
db.close();
EMBEDDED_TEXTS.length = 0;
// The rejection message is matched by pattern, not by exact text.
await expect(buildEmbeddings(ghostRepo, 'minilm', ghostDb)).rejects.toThrow(
/could not read any of the .* source files/,
);
// No embeddings were persisted (they would have been overwritten via DELETE)
const readDb = new Database(ghostDb, { readonly: true });
const count = readDb.prepare('SELECT COUNT(*) as c FROM embeddings').get().c;
readDb.close();
expect(count).toBe(0);
} finally {
fs.rmSync(ghostRepo, { recursive: true, force: true });
}
});
});
// A single ~400-line function is embedded with minilm (256-token window).
// The source strategy must warn and truncate; the structured strategy must
// stay inside the window without warning.
describe('context window overflow detection', () => {
  let bigDir: string, bigDbPath: string;

  beforeAll(() => {
    // Create a file with a very large function that will overflow minilm's 256-token window
    bigDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-overflow-test-'));
    const bigFn =
      'export function bigFunction(x) {\n' +
      ' const data = [];\n'.repeat(400) +
      ' return data;\n}\n';
    fs.writeFileSync(path.join(bigDir, 'big.js'), bigFn);
    const bigDbDir = path.join(bigDir, '.codegraph');
    fs.mkdirSync(bigDbDir, { recursive: true });
    bigDbPath = path.join(bigDbDir, 'graph.db');
    const db = new Database(bigDbPath);
    db.pragma('journal_mode = WAL');
    initSchema(db);
    // 1 header line + 400 body lines + 2 closing lines = 403 lines.
    insertNode(db, 'bigFunction', 'function', 'big.js', 1, 403);
    db.close();
  });

  afterAll(() => {
    if (bigDir) fs.rmSync(bigDir, { recursive: true, force: true });
  });

  test('warns and truncates when source text exceeds context window', async () => {
    const warnSpy = vi.spyOn(process.stderr, 'write').mockImplementation(() => true);
    let warnOutput = '';
    try {
      EMBEDDED_TEXTS.length = 0;
      await buildEmbeddings(bigDir, 'minilm', bigDbPath, { strategy: 'source' });
      // Read the recorded calls before restoring the spy.
      warnOutput = warnSpy.mock.calls.map((c) => c[0]).join('');
    } finally {
      // Always give stderr back, even when the build rejects; otherwise the
      // mocked write would swallow output for the rest of the run.
      warnSpy.mockRestore();
    }
    expect(warnOutput).toContain('exceeded model context window');
    expect(warnOutput).toContain('truncated');

    // Text should be truncated to fit minilm's 256-token ≈ 1024 char limit
    const bigText = EMBEDDED_TEXTS.find((t) => t.includes('bigFunction'));
    expect(bigText).toBeDefined();
    expect(bigText.length).toBeLessThanOrEqual(256 * 4);

    // Metadata records truncation count
    const db = new Database(bigDbPath, { readonly: true });
    const row = db.prepare("SELECT value FROM embedding_meta WHERE key = 'truncated_count'").get();
    db.close();
    expect(row).toBeDefined();
    expect(Number(row.value)).toBeGreaterThan(0);
  });

  test('structured strategy avoids overflow for same function', async () => {
    const warnSpy = vi.spyOn(process.stderr, 'write').mockImplementation(() => true);
    let warnOutput = '';
    try {
      EMBEDDED_TEXTS.length = 0;
      await buildEmbeddings(bigDir, 'minilm', bigDbPath, { strategy: 'structured' });
      warnOutput = warnSpy.mock.calls.map((c) => c[0]).join('');
    } finally {
      // Restore stderr on every path, including a rejected build.
      warnSpy.mockRestore();
    }

    // Structured strategy only uses first few lines + graph context → should NOT overflow
    const bigText = EMBEDDED_TEXTS.find((t) => t.includes('bigFunction'));
    expect(bigText).toBeDefined();
    expect(estimateTokens(bigText)).toBeLessThan(256);

    // No truncation warning expected
    expect(warnOutput).not.toContain('exceeded model context window');
  });
});