diff --git a/src/cli.js b/src/cli.js index 00c5431b5..b5a482218 100644 --- a/src/cli.js +++ b/src/cli.js @@ -8,7 +8,7 @@ import { buildGraph } from './builder.js'; import { loadConfig } from './config.js'; import { findCycles, formatCycles } from './cycles.js'; import { findDbPath } from './db.js'; -import { buildEmbeddings, MODELS, search } from './embedder.js'; +import { buildEmbeddings, EMBEDDING_STRATEGIES, MODELS, search } from './embedder.js'; import { exportDOT, exportJSON, exportMermaid } from './export.js'; import { setVerbose } from './logger.js'; import { @@ -418,9 +418,12 @@ program console.log('\nAvailable embedding models:\n'); for (const [key, config] of Object.entries(MODELS)) { const def = key === 'minilm' ? ' (default)' : ''; - console.log(` ${key.padEnd(12)} ${String(config.dim).padStart(4)}d ${config.desc}${def}`); + const ctx = config.contextWindow ? `${config.contextWindow} ctx` : ''; + console.log( + ` ${key.padEnd(12)} ${String(config.dim).padStart(4)}d ${ctx.padEnd(9)} ${config.desc}${def}`, + ); } - console.log('\nUsage: codegraph embed --model '); + console.log('\nUsage: codegraph embed --model --strategy '); console.log(' codegraph search "query" --model \n'); }); @@ -434,9 +437,20 @@ program 'Embedding model: minilm (default), jina-small, jina-base, jina-code, nomic, nomic-v1.5, bge-large. Run `codegraph models` for details', 'minilm', ) + .option( + '-s, --strategy ', + `Embedding strategy: ${EMBEDDING_STRATEGIES.join(', ')}. "structured" uses graph context (callers/callees), "source" embeds raw code`, + 'structured', + ) .action(async (dir, opts) => { + if (!EMBEDDING_STRATEGIES.includes(opts.strategy)) { + console.error( + `Unknown strategy: ${opts.strategy}. Available: ${EMBEDDING_STRATEGIES.join(', ')}`, + ); + process.exit(1); + } const root = path.resolve(dir || '.'); - await buildEmbeddings(root, opts.model); + await buildEmbeddings(root, opts.model, undefined, { strategy: opts.strategy }); }); program diff --git a/src/embedder.js b/src/embedder.js index 6876a00e4..67eb39e5d 100644 --- a/src/embedder.js +++ b/src/embedder.js @@ -26,47 +26,56 @@ export const MODELS = { minilm: { name: 'Xenova/all-MiniLM-L6-v2', dim: 384, + contextWindow: 256, desc: 'Smallest, fastest (~23MB). General text.', quantized: true, }, 'jina-small': { name: 'Xenova/jina-embeddings-v2-small-en', dim: 512, + contextWindow: 8192, desc: 'Small, good quality (~33MB). General text.', quantized: false, }, 'jina-base': { name: 'Xenova/jina-embeddings-v2-base-en', dim: 768, + contextWindow: 8192, desc: 'Good quality (~137MB). General text, 8192 token context.', quantized: false, }, 'jina-code': { name: 'Xenova/jina-embeddings-v2-base-code', dim: 768, + contextWindow: 8192, desc: 'Code-aware (~137MB). Trained on code+text, best for code search.', quantized: false, }, nomic: { name: 'Xenova/nomic-embed-text-v1', dim: 768, + contextWindow: 8192, desc: 'Good local quality (~137MB). 8192 context.', quantized: false, }, 'nomic-v1.5': { name: 'nomic-ai/nomic-embed-text-v1.5', dim: 768, + contextWindow: 8192, desc: 'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.', quantized: false, }, 'bge-large': { name: 'Xenova/bge-large-en-v1.5', dim: 1024, + contextWindow: 512, desc: 'Best general retrieval (~335MB). Top MTEB scores.', quantized: false, }, }; +export const EMBEDDING_STRATEGIES = ['structured', 'source']; + export const DEFAULT_MODEL = 'minilm'; const BATCH_SIZE_MAP = { minilm: 32, @@ -89,6 +98,108 @@ function getModelConfig(modelKey) { return config; } +/** + * Rough token estimate (~4 chars per token for code/English). + * Conservative — avoids adding a tokenizer dependency. + */ +export function estimateTokens(text) { + return Math.ceil(text.length / 4); +} + +/** + * Extract leading comment text (JSDoc, //, #, etc.) above a function line. + * Returns the cleaned comment text or null if none found. + */ +function extractLeadingComment(lines, fnLineIndex) { + const raw = []; + for (let i = fnLineIndex - 1; i >= Math.max(0, fnLineIndex - 15); i--) { + const trimmed = lines[i].trim(); + if (/^(\/\/|\/\*|\*\/|\*|#|\/\/\/)/.test(trimmed)) { + raw.unshift(trimmed); + } else if (trimmed === '') { + if (raw.length > 0) break; + } else { + break; + } + } + if (raw.length === 0) return null; + return raw + .map((line) => + line + .replace(/^\/\*\*?\s?|\*\/$/g, '') // opening /** or /* and closing */ + .replace(/^\*\s?/, '') // middle * lines + .replace(/^\/\/\/?\s?/, '') // // or /// + .replace(/^#\s?/, '') // # (Python/Ruby) + .trim(), + ) + .filter((l) => l.length > 0) + .join(' '); +} + +/** + * Build graph-enriched text for a symbol using dependency context. + * Produces compact, semantic text (~100 tokens) instead of full source code. + */ +function buildStructuredText(node, file, lines, calleesStmt, callersStmt) { + const readable = splitIdentifier(node.name); + const parts = [`${node.kind} ${node.name} (${readable}) in ${file}`]; + const startLine = Math.max(0, node.line - 1); + + // Extract parameters from signature (best-effort, single-line) + const sigLine = lines[startLine] || ''; + const paramMatch = sigLine.match(/\(([^)]*)\)/); + if (paramMatch?.[1]?.trim()) { + parts.push(`Parameters: ${paramMatch[1].trim()}`); + } + + // Graph context: callees (capped at 10) + const callees = calleesStmt.all(node.id); + if (callees.length > 0) { + parts.push( + `Calls: ${callees + .slice(0, 10) + .map((c) => c.name) + .join(', ')}`, + ); + } + + // Graph context: callers (capped at 10) + const callers = callersStmt.all(node.id); + if (callers.length > 0) { + parts.push( + `Called by: ${callers + .slice(0, 10) + .map((c) => c.name) + .join(', ')}`, + ); + } + + // Leading comment (high semantic value) or first few lines of code + const comment = extractLeadingComment(lines, startLine); + if (comment) { + parts.push(comment); + } else { + const endLine = Math.min(lines.length, startLine + 4); + const snippet = lines.slice(startLine, endLine).join('\n').trim(); + if (snippet) parts.push(snippet); + } + + return parts.join('\n'); +} + +/** + * Build raw source-code text for a symbol (original strategy). + */ +function buildSourceText(node, file, lines) { + const startLine = Math.max(0, node.line - 1); + const endLine = node.end_line + ? Math.min(lines.length, node.end_line) + : Math.min(lines.length, startLine + 15); + const context = lines.slice(startLine, endLine).join('\n'); + const readable = splitIdentifier(node.name); + return `${node.kind} ${node.name} (${readable}) in ${file}\n${context}`; +} + /** * Lazy-load @huggingface/transformers. * This is an optional dependency — gives a clear error if not installed. @@ -203,10 +314,14 @@ function initEmbeddingsSchema(db) { /** * Build embeddings for all functions/methods/classes in the graph. + * @param {string} rootDir - Project root directory + * @param {string} modelKey - Model identifier from MODELS registry + * @param {string} [customDbPath] - Override path to graph.db + * @param {object} [options] - Embedding options + * @param {string} [options.strategy='structured'] - 'structured' (graph-enriched) or 'source' (raw code) */ -export async function buildEmbeddings(rootDir, modelKey, customDbPath) { - // path already imported at top - // fs already imported at top +export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = {}) { + const strategy = options.strategy || 'structured'; const dbPath = customDbPath || findDbPath(null); const db = new Database(dbPath); @@ -221,7 +336,24 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) { ) .all(); - console.log(`Building embeddings for ${nodes.length} symbols...`); + console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`); + + // Prepare graph-context queries for structured strategy + let calleesStmt, callersStmt; + if (strategy === 'structured') { + calleesStmt = db.prepare(` + SELECT DISTINCT n.name FROM edges e + JOIN nodes n ON e.target_id = n.id + WHERE e.source_id = ? AND e.kind = 'calls' + ORDER BY n.name + `); + callersStmt = db.prepare(` + SELECT DISTINCT n.name FROM edges e + JOIN nodes n ON e.source_id = n.id + WHERE e.target_id = ? AND e.kind = 'calls' + ORDER BY n.name + `); + } const byFile = new Map(); for (const node of nodes) { @@ -232,6 +364,9 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) { const texts = []; const nodeIds = []; const previews = []; + const config = getModelConfig(modelKey); + const contextWindow = config.contextWindow; + let overflowCount = 0; for (const [file, fileNodes] of byFile) { const fullPath = path.join(rootDir, file); @@ -244,20 +379,31 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) { } for (const node of fileNodes) { - const startLine = Math.max(0, node.line - 1); - const endLine = node.end_line - ? Math.min(lines.length, node.end_line) - : Math.min(lines.length, startLine + 15); - const context = lines.slice(startLine, endLine).join('\n'); - - const readable = splitIdentifier(node.name); - const text = `${node.kind} ${node.name} (${readable}) in ${file}\n${context}`; + let text = + strategy === 'structured' + ? buildStructuredText(node, file, lines, calleesStmt, callersStmt) + : buildSourceText(node, file, lines); + + // Detect and handle context window overflow + const tokens = estimateTokens(text); + if (tokens > contextWindow) { + overflowCount++; + const maxChars = contextWindow * 4; + text = text.slice(0, maxChars); + } + texts.push(text); nodeIds.push(node.id); previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`); } } + if (overflowCount > 0) { + warn( + `${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`, + ); + } + console.log(`Embedding ${texts.length} symbols...`); const { vectors, dim } = await embed(texts, modelKey); @@ -269,16 +415,19 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) { for (let i = 0; i < vectors.length; i++) { insert.run(nodeIds[i], Buffer.from(vectors[i].buffer), previews[i]); } - const config = getModelConfig(modelKey); insertMeta.run('model', config.name); insertMeta.run('dim', String(dim)); insertMeta.run('count', String(vectors.length)); + insertMeta.run('strategy', strategy); insertMeta.run('built_at', new Date().toISOString()); + if (overflowCount > 0) { + insertMeta.run('truncated_count', String(overflowCount)); + } }); insertAll(); console.log( - `\nStored ${vectors.length} embeddings (${dim}d, ${getModelConfig(modelKey).name}) in graph.db`, + `\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`, ); db.close(); } diff --git a/src/index.js b/src/index.js index a0caf3b4d..7435b8a61 100644 --- a/src/index.js +++ b/src/index.js @@ -21,7 +21,9 @@ export { buildEmbeddings, cosineSim, DEFAULT_MODEL, + EMBEDDING_STRATEGIES, embed, + estimateTokens, MODELS, multiSearchData, search, diff --git a/tests/search/embedding-benchmark.js b/tests/search/embedding-benchmark.js new file mode 100644 index 000000000..11dc9aad0 --- /dev/null +++ b/tests/search/embedding-benchmark.js @@ -0,0 +1,124 @@ +#!/usr/bin/env node + +/** + * Embedding strategy benchmark — compares structured vs source strategies + * against real search queries on the current project's graph. + * + * Prerequisites: + * - @huggingface/transformers installed + * - codegraph build already run (graph.db exists) + * + * Usage: + * node tests/search/embedding-benchmark.js + * node tests/search/embedding-benchmark.js --model minilm + */ + +import path from 'node:path'; +import { buildEmbeddings, DEFAULT_MODEL, MODELS, searchData } from '../../src/embedder.js'; + +const model = process.argv.includes('--model') + ? process.argv[process.argv.indexOf('--model') + 1] + : DEFAULT_MODEL; + +const rootDir = '.'; +const dbPath = path.resolve('.codegraph/graph.db'); + +// Queries with expected best-match symbol name +const QUERIES = [ + { q: 'parse source code with tree-sitter', expect: 'parseFilesAuto' }, + { q: 'find circular dependencies', expect: 'findCycles' }, + { q: 'build dependency graph from source files', expect: 'buildGraph' }, + { q: 'resolve import path to actual file', expect: 'resolveImportPath' }, + { q: 'cosine similarity between vectors', expect: 'cosineSim' }, + { q: 'export graph as DOT format', expect: 'exportDOT' }, + { q: 'semantic search with embeddings', expect: 'search' }, + { q: 'incremental file hashing', expect: 'hashFile' }, + { q: 'load configuration from file', expect: 'loadConfig' }, + { q: 'extract functions and classes from code', expect: 'extractJavaScript' }, + { q: 'impact analysis of code changes', expect: 'diffImpactData' }, + { q: 'start MCP server for AI agents', expect: 'startMCPServer' }, + { q: 'watch files for changes', expect: 'watchProject' }, + { q: 'reciprocal rank fusion for multi-query search', expect: 'multiSearchData' }, +]; + +async function benchmark(strategy) { + await buildEmbeddings(rootDir, model, dbPath, { strategy }); + + let hits1 = 0; + let hits3 = 0; + let hits5 = 0; + const details = []; + + for (const { q, expect: expected } of QUERIES) { + const data = await searchData(q, dbPath, { minScore: 0.01, limit: 10 }); + if (!data) continue; + + const names = data.results.map((r) => r.name); + const rank = names.indexOf(expected) + 1; // 0 = not found + if (rank === 1) hits1++; + if (rank >= 1 && rank <= 3) hits3++; + if (rank >= 1 && rank <= 5) hits5++; + + const matchScore = rank > 0 ? data.results[rank - 1].similarity.toFixed(3) : 'miss'; + details.push({ + q: q.slice(0, 50), + expected, + rank: rank || '>10', + actual: names[0], + matchScore, + }); + } + + return { strategy, hits1, hits3, hits5, total: QUERIES.length, details }; +} + +const modelConfig = MODELS[model]; +console.log('=== Embedding Strategy Benchmark ==='); +console.log(`Model: ${model} (${modelConfig.dim}d, ${modelConfig.contextWindow} token context)`); +console.log(`Queries: ${QUERIES.length}`); +console.log(''); + +const structured = await benchmark('structured'); +const source = await benchmark('source'); + +// Summary table +console.log(''); +console.log('=== RESULTS ==='); +console.log(''); +console.log(`${'Metric'.padEnd(12)}${'structured'.padEnd(16)}${'source'.padEnd(16)}delta`); +for (const [label, key] of [ + ['Hit@1', 'hits1'], + ['Hit@3', 'hits3'], + ['Hit@5', 'hits5'], +]) { + const s = structured[key]; + const o = source[key]; + const sp = `${s}/${structured.total} (${((s / structured.total) * 100).toFixed(0)}%)`; + const op = `${o}/${source.total} (${((o / source.total) * 100).toFixed(0)}%)`; + const delta = s - o; + const sign = delta > 0 ? '+' : ''; + console.log(`${label.padEnd(12)}${sp.padEnd(16)}${op.padEnd(16)}${sign}${delta}`); +} + +// Per-query comparison +console.log(''); +console.log(`${'Query'.padEnd(52)}${'Expected'.padEnd(22)}Struct Source`); +for (let i = 0; i < QUERIES.length; i++) { + const s = structured.details[i]; + const o = source.details[i]; + const sw = + typeof s.rank === 'number' && (typeof o.rank !== 'number' || s.rank < o.rank) ? '*' : ' '; + const ow = + typeof o.rank === 'number' && (typeof s.rank !== 'number' || o.rank < s.rank) ? '*' : ' '; + console.log( + s.q.padEnd(52) + + s.expected.padEnd(22) + + String(s.rank).padEnd(4) + + sw + + ' ' + + String(o.rank).padEnd(4) + + ow, + ); +} +console.log(''); +console.log('* = better rank for that query'); diff --git a/tests/search/embedding-strategy.test.js b/tests/search/embedding-strategy.test.js new file mode 100644 index 000000000..5db82bb9d --- /dev/null +++ b/tests/search/embedding-strategy.test.js @@ -0,0 +1,306 @@ +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import Database from 'better-sqlite3'; +import { afterAll, beforeAll, describe, expect, test, vi } from 'vitest'; +import { initSchema } from '../../src/db.js'; + +// ─── Mock setup ──────────────────────────────────────────────────────── + +// Capture texts passed to the embedding model +const { EMBEDDED_TEXTS } = vi.hoisted(() => ({ + EMBEDDED_TEXTS: [], +})); + +vi.mock('@huggingface/transformers', () => ({ + pipeline: async () => async (batch) => { + const dim = 384; + const data = new Float32Array(dim * batch.length); + for (let t = 0; t < batch.length; t++) { + EMBEDDED_TEXTS.push(batch[t]); + data[t * dim] = 0.5; + data[t * dim + 1] = 0.3; + } + return { data }; + }, + cos_sim: () => 0, +})); + +import { + buildEmbeddings, + EMBEDDING_STRATEGIES, + estimateTokens, + MODELS, +} from '../../src/embedder.js'; + +// ─── Helpers ─────────────────────────────────────────────────────────── + +function insertNode(db, name, kind, file, line, endLine) { + return db + .prepare('INSERT INTO nodes (name, kind, file, line, end_line) VALUES (?, ?, ?, ?, ?)') + .run(name, kind, file, line, endLine).lastInsertRowid; +} + +function insertEdge(db, sourceId, targetId, kind) { + db.prepare('INSERT INTO edges (source_id, target_id, kind) VALUES (?, ?, ?)').run( + sourceId, + targetId, + kind, + ); +} + +// ─── Fixture ─────────────────────────────────────────────────────────── + +// Source files that match the DB nodes +const FIXTURE_FILES = { + 'math.js': [ + '/**', + ' * Add two numbers together.', + ' */', + 'export function add(a, b) { return a + b; }', + 'export function multiply(a, b) { return a * b; }', + 'export function square(x) { return multiply(x, x); }', + ].join('\n'), + 'utils.js': [ + "import { add, square } from './math.js';", + 'export function sumOfSquares(a, b) { return add(square(a), square(b)); }', + 'export class Calculator {', + ' compute(x, y) { return sumOfSquares(x, y); }', + '}', + ].join('\n'), +}; + +let tmpDir, dbPath; + +beforeAll(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-strategy-test-')); + + // Write source files + for (const [name, content] of Object.entries(FIXTURE_FILES)) { + fs.writeFileSync(path.join(tmpDir, name), content); + } + + // Create DB with nodes + edges + const dbDir = path.join(tmpDir, '.codegraph'); + fs.mkdirSync(dbDir, { recursive: true }); + dbPath = path.join(dbDir, 'graph.db'); + + const db = new Database(dbPath); + db.pragma('journal_mode = WAL'); + initSchema(db); + + // math.js nodes (line numbers are 1-indexed) + const addId = insertNode(db, 'add', 'function', 'math.js', 4, 4); + const multiplyId = insertNode(db, 'multiply', 'function', 'math.js', 5, 5); + const squareId = insertNode(db, 'square', 'function', 'math.js', 6, 6); + + // utils.js nodes + const sumOfSquaresId = insertNode(db, 'sumOfSquares', 'function', 'utils.js', 2, 2); + insertNode(db, 'Calculator', 'class', 'utils.js', 3, 5); + const computeId = insertNode(db, 'compute', 'method', 'utils.js', 4, 4); + + // Call edges: square → multiply, sumOfSquares → add, sumOfSquares → square, compute → sumOfSquares + insertEdge(db, squareId, multiplyId, 'calls'); + insertEdge(db, sumOfSquaresId, addId, 'calls'); + insertEdge(db, sumOfSquaresId, squareId, 'calls'); + insertEdge(db, computeId, sumOfSquaresId, 'calls'); + + db.close(); +}); + +afterAll(() => { + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); +}); + +// ─── Tests ───────────────────────────────────────────────────────────── + +describe('EMBEDDING_STRATEGIES', () => { + test('exports valid strategies', () => { + expect(EMBEDDING_STRATEGIES).toContain('structured'); + expect(EMBEDDING_STRATEGIES).toContain('source'); + }); +}); + +describe('estimateTokens', () => { + test('estimates ~4 chars per token', () => { + expect(estimateTokens('abcd')).toBe(1); + expect(estimateTokens('abcdefgh')).toBe(2); + expect(estimateTokens('a'.repeat(100))).toBe(25); + }); + + test('rounds up', () => { + expect(estimateTokens('abcde')).toBe(2); + }); + + test('handles empty string', () => { + expect(estimateTokens('')).toBe(0); + }); +}); + +describe('MODELS contextWindow', () => { + test('every model has a contextWindow', () => { + for (const [key, config] of Object.entries(MODELS)) { + expect(config.contextWindow, `${key} missing contextWindow`).toBeGreaterThan(0); + } + }); +}); + +describe('buildEmbeddings with structured strategy', () => { + test('produces embeddings with graph context', async () => { + EMBEDDED_TEXTS.length = 0; + await buildEmbeddings(tmpDir, 'minilm', dbPath, { strategy: 'structured' }); + + expect(EMBEDDED_TEXTS.length).toBeGreaterThan(0); + + // square calls multiply → should appear in structured text + const squareText = EMBEDDED_TEXTS.find((t) => t.startsWith('function square')); + expect(squareText).toBeDefined(); + expect(squareText).toContain('Calls:'); + expect(squareText).toContain('multiply'); + + // sumOfSquares calls add and square → should appear + const sosText = EMBEDDED_TEXTS.find((t) => t.startsWith('function sumOfSquares')); + expect(sosText).toBeDefined(); + expect(sosText).toContain('Calls:'); + expect(sosText).toContain('add'); + expect(sosText).toContain('square'); + + // sumOfSquares is called by compute → should appear + expect(sosText).toContain('Called by:'); + expect(sosText).toContain('compute'); + }); + + test('extracts leading comments', async () => { + // add has a JSDoc comment above it: "Add two numbers together." + const addText = EMBEDDED_TEXTS.find((t) => t.startsWith('function add')); + expect(addText).toBeDefined(); + expect(addText).toContain('Add two numbers together'); + }); + + test('extracts parameters from signature', async () => { + const addText = EMBEDDED_TEXTS.find((t) => t.startsWith('function add')); + expect(addText).toBeDefined(); + expect(addText).toContain('Parameters:'); + expect(addText).toContain('a, b'); + }); + + test('stores strategy in metadata', async () => { + const db = new Database(dbPath, { readonly: true }); + const row = db.prepare("SELECT value FROM embedding_meta WHERE key = 'strategy'").get(); + db.close(); + expect(row.value).toBe('structured'); + }); + + test('structured texts are compact', () => { + for (const text of EMBEDDED_TEXTS) { + const tokens = estimateTokens(text); + expect(tokens).toBeLessThan(200); + } + }); +}); + +describe('buildEmbeddings with source strategy', () => { + test('produces embeddings with raw source code', async () => { + EMBEDDED_TEXTS.length = 0; + await buildEmbeddings(tmpDir, 'minilm', dbPath, { strategy: 'source' }); + + expect(EMBEDDED_TEXTS.length).toBeGreaterThan(0); + + // Source strategy should NOT have graph context lines + const squareText = EMBEDDED_TEXTS.find((t) => t.startsWith('function square')); + expect(squareText).toBeDefined(); + expect(squareText).not.toContain('Calls:'); + expect(squareText).not.toContain('Called by:'); + expect(squareText).toContain('return'); + }); + + test('stores strategy in metadata', async () => { + const db = new Database(dbPath, { readonly: true }); + const row = db.prepare("SELECT value FROM embedding_meta WHERE key = 'strategy'").get(); + db.close(); + expect(row.value).toBe('source'); + }); +}); + +describe('buildEmbeddings defaults to structured', () => { + test('no options → structured strategy', async () => { + EMBEDDED_TEXTS.length = 0; + await buildEmbeddings(tmpDir, 'minilm', dbPath); + + const db = new Database(dbPath, { readonly: true }); + const row = db.prepare("SELECT value FROM embedding_meta WHERE key = 'strategy'").get(); + db.close(); + expect(row.value).toBe('structured'); + }); +}); + +describe('context window overflow detection', () => { + let bigDir, bigDbPath; + + beforeAll(() => { + // Create a file with a very large function that will overflow minilm's 256-token window + bigDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-overflow-test-')); + const bigFn = + 'export function bigFunction(x) {\n' + + ' const data = [];\n'.repeat(400) + + ' return data;\n}\n'; + fs.writeFileSync(path.join(bigDir, 'big.js'), bigFn); + + const bigDbDir = path.join(bigDir, '.codegraph'); + fs.mkdirSync(bigDbDir, { recursive: true }); + bigDbPath = path.join(bigDbDir, 'graph.db'); + + const db = new Database(bigDbPath); + db.pragma('journal_mode = WAL'); + initSchema(db); + insertNode(db, 'bigFunction', 'function', 'big.js', 1, 403); + db.close(); + }); + + afterAll(() => { + if (bigDir) fs.rmSync(bigDir, { recursive: true, force: true }); + }); + + test('warns and truncates when source text exceeds context window', async () => { + const warnSpy = vi.spyOn(process.stderr, 'write').mockImplementation(() => true); + + EMBEDDED_TEXTS.length = 0; + await buildEmbeddings(bigDir, 'minilm', bigDbPath, { strategy: 'source' }); + + const warnOutput = warnSpy.mock.calls.map((c) => c[0]).join(''); + warnSpy.mockRestore(); + + expect(warnOutput).toContain('exceeded model context window'); + expect(warnOutput).toContain('truncated'); + + // Text should be truncated to fit minilm's 256-token ≈ 1024 char limit + const bigText = EMBEDDED_TEXTS.find((t) => t.includes('bigFunction')); + expect(bigText).toBeDefined(); + expect(bigText.length).toBeLessThanOrEqual(256 * 4); + + // Metadata records truncation count + const db = new Database(bigDbPath, { readonly: true }); + const row = db.prepare("SELECT value FROM embedding_meta WHERE key = 'truncated_count'").get(); + db.close(); + expect(row).toBeDefined(); + expect(Number(row.value)).toBeGreaterThan(0); + }); + + test('structured strategy avoids overflow for same function', async () => { + const warnSpy = vi.spyOn(process.stderr, 'write').mockImplementation(() => true); + + EMBEDDED_TEXTS.length = 0; + await buildEmbeddings(bigDir, 'minilm', bigDbPath, { strategy: 'structured' }); + + const warnOutput = warnSpy.mock.calls.map((c) => c[0]).join(''); + warnSpy.mockRestore(); + + // Structured strategy only uses first few lines + graph context → should NOT overflow + const bigText = EMBEDDED_TEXTS.find((t) => t.includes('bigFunction')); + expect(bigText).toBeDefined(); + expect(estimateTokens(bigText)).toBeLessThan(256); + + // No truncation warning expected + expect(warnOutput).not.toContain('exceeded model context window'); + }); +});