Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions src/cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import { buildGraph } from './builder.js';
import { loadConfig } from './config.js';
import { findCycles, formatCycles } from './cycles.js';
import { findDbPath } from './db.js';
import { buildEmbeddings, MODELS, search } from './embedder.js';
import { buildEmbeddings, EMBEDDING_STRATEGIES, MODELS, search } from './embedder.js';
import { exportDOT, exportJSON, exportMermaid } from './export.js';
import { setVerbose } from './logger.js';
import {
Expand Down Expand Up @@ -418,9 +418,12 @@ program
console.log('\nAvailable embedding models:\n');
for (const [key, config] of Object.entries(MODELS)) {
const def = key === 'minilm' ? ' (default)' : '';
console.log(` ${key.padEnd(12)} ${String(config.dim).padStart(4)}d ${config.desc}${def}`);
const ctx = config.contextWindow ? `${config.contextWindow} ctx` : '';
console.log(
` ${key.padEnd(12)} ${String(config.dim).padStart(4)}d ${ctx.padEnd(9)} ${config.desc}${def}`,
);
}
console.log('\nUsage: codegraph embed --model <name>');
console.log('\nUsage: codegraph embed --model <name> --strategy <structured|source>');
console.log(' codegraph search "query" --model <name>\n');
});

Expand All @@ -434,9 +437,20 @@ program
'Embedding model: minilm (default), jina-small, jina-base, jina-code, nomic, nomic-v1.5, bge-large. Run `codegraph models` for details',
'minilm',
)
.option(
'-s, --strategy <name>',
`Embedding strategy: ${EMBEDDING_STRATEGIES.join(', ')}. "structured" uses graph context (callers/callees), "source" embeds raw code`,
'structured',
)
.action(async (dir, opts) => {
if (!EMBEDDING_STRATEGIES.includes(opts.strategy)) {
console.error(
`Unknown strategy: ${opts.strategy}. Available: ${EMBEDDING_STRATEGIES.join(', ')}`,
);
process.exit(1);
}
const root = path.resolve(dir || '.');
await buildEmbeddings(root, opts.model);
await buildEmbeddings(root, opts.model, undefined, { strategy: opts.strategy });
});

program
Expand Down
177 changes: 163 additions & 14 deletions src/embedder.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,47 +26,56 @@ export const MODELS = {
minilm: {
name: 'Xenova/all-MiniLM-L6-v2',
dim: 384,
contextWindow: 256,
desc: 'Smallest, fastest (~23MB). General text.',
quantized: true,
},
'jina-small': {
name: 'Xenova/jina-embeddings-v2-small-en',
dim: 512,
contextWindow: 8192,
desc: 'Small, good quality (~33MB). General text.',
quantized: false,
},
'jina-base': {
name: 'Xenova/jina-embeddings-v2-base-en',
dim: 768,
contextWindow: 8192,
desc: 'Good quality (~137MB). General text, 8192 token context.',
quantized: false,
},
'jina-code': {
name: 'Xenova/jina-embeddings-v2-base-code',
dim: 768,
contextWindow: 8192,
desc: 'Code-aware (~137MB). Trained on code+text, best for code search.',
quantized: false,
},
nomic: {
name: 'Xenova/nomic-embed-text-v1',
dim: 768,
contextWindow: 8192,
desc: 'Good local quality (~137MB). 8192 context.',
quantized: false,
},
'nomic-v1.5': {
name: 'nomic-ai/nomic-embed-text-v1.5',
dim: 768,
contextWindow: 8192,
desc: 'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.',
quantized: false,
},
'bge-large': {
name: 'Xenova/bge-large-en-v1.5',
dim: 1024,
contextWindow: 512,
desc: 'Best general retrieval (~335MB). Top MTEB scores.',
quantized: false,
},
};

/**
 * Names of the supported embedding text strategies.
 * - 'structured': text built from graph context (callers/callees) around a symbol.
 * - 'source': text built from the symbol's raw source code.
 */
export const EMBEDDING_STRATEGIES = ['structured', 'source'];

export const DEFAULT_MODEL = 'minilm';
const BATCH_SIZE_MAP = {
minilm: 32,
Expand All @@ -89,6 +98,108 @@ function getModelConfig(modelKey) {
return config;
}

/**
 * Approximate how many tokens a piece of text occupies without running a
 * real tokenizer. Assumes a flat ratio of four characters per token and
 * rounds up, so any non-empty text counts as at least one token.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count.
 */
export function estimateTokens(text) {
  const CHARS_PER_TOKEN = 4;
  const charCount = text.length;
  return Math.ceil(charCount / CHARS_PER_TOKEN);
}

/**
 * Extract leading comment text (JSDoc, //, ///, #, etc.) above a function line.
 *
 * Scans upward from the line just before `fnLineIndex`, looking at no more
 * than 15 lines. Blank lines between the function and its comment are
 * skipped; a blank line above collected comment lines, or any non-comment
 * line, ends the scan. Comment markers are stripped and the remaining text
 * is joined with single spaces.
 *
 * @param {string[]} lines - File content split into lines.
 * @param {number} fnLineIndex - Zero-based index of the function's first line.
 * @returns {string|null} Cleaned comment text, or null when there is no
 *   comment, the comment has no text, or `fnLineIndex` points past the file.
 */
function extractLeadingComment(lines, fnLineIndex) {
  const raw = [];
  const lowestIndex = Math.max(0, fnLineIndex - 15);
  for (let i = fnLineIndex - 1; i >= lowestIndex; i--) {
    const line = lines[i];
    // A stale line number can point past the end of the file; stop instead
    // of throwing on `undefined.trim()` or attaching an unrelated comment.
    if (typeof line !== 'string') break;
    const trimmed = line.trim();
    if (/^(\/\/|\/\*|\*\/|\*|#)/.test(trimmed)) {
      raw.unshift(trimmed);
    } else if (trimmed === '') {
      // Blank lines are tolerated only between the function and its comment.
      if (raw.length > 0) break;
    } else {
      break;
    }
  }
  if (raw.length === 0) return null;
  const text = raw
    .map((commentLine) =>
      commentLine
        .replace(/^\/\*\*?\s?|\*\/$/g, '') // opening /** or /* and closing */
        .replace(/^\*\s?/, '') // middle * lines
        .replace(/^\/\/\/?\s?/, '') // // or ///
        .replace(/^#\s?/, '') // # (Python/Ruby)
        .trim(),
    )
    .filter((l) => l.length > 0)
    .join(' ');
  // Markers without any text (e.g. a bare `//`) count as "no comment".
  return text.length > 0 ? text : null;
}

/**
 * Compose the graph-enriched text that is embedded for one symbol.
 *
 * The result is a newline-separated list of sections, in this order:
 *   1. header: kind, name, split identifier, and file
 *   2. "Parameters: ..." taken from the first parenthesised group on the
 *      symbol's first line (best-effort, single-line only)
 *   3. "Calls: ..." with up to 10 callee names
 *   4. "Called by: ..." with up to 10 caller names
 *   5. the leading comment if one exists, otherwise the first 4 source lines
 *
 * @param {object} node - Symbol row; reads `id`, `kind`, `name`, and 1-based `line`.
 * @param {string} file - File path shown in the header.
 * @param {string[]} lines - File content split into lines.
 * @param {object} calleesStmt - Statement whose `all(id)` returns rows with a `name`.
 * @param {object} callersStmt - Statement whose `all(id)` returns rows with a `name`.
 * @returns {string} The sections joined with newlines.
 */
function buildStructuredText(node, file, lines, calleesStmt, callersStmt) {
  const sections = [`${node.kind} ${node.name} (${splitIdentifier(node.name)}) in ${file}`];
  const firstLineIdx = Math.max(0, node.line - 1);

  // Best-effort parameter list: whatever sits inside the first (...) on the signature line.
  const signature = lines[firstLineIdx] || '';
  const params = signature.match(/\(([^)]*)\)/)?.[1]?.trim();
  if (params) {
    sections.push(`Parameters: ${params}`);
  }

  // Shared formatter for both edge directions; at most 10 names are listed.
  const listNames = (rows) =>
    rows
      .slice(0, 10)
      .map((row) => row.name)
      .join(', ');

  const calleeRows = calleesStmt.all(node.id);
  if (calleeRows.length > 0) {
    sections.push(`Calls: ${listNames(calleeRows)}`);
  }

  const callerRows = callersStmt.all(node.id);
  if (callerRows.length > 0) {
    sections.push(`Called by: ${listNames(callerRows)}`);
  }

  // Prefer the leading comment; otherwise fall back to the opening lines of code.
  const docText = extractLeadingComment(lines, firstLineIdx);
  if (docText) {
    sections.push(docText);
    return sections.join('\n');
  }

  const snippetEnd = Math.min(lines.length, firstLineIdx + 4);
  const snippet = lines.slice(firstLineIdx, snippetEnd).join('\n').trim();
  if (snippet) {
    sections.push(snippet);
  }
  return sections.join('\n');
}

/**
 * Compose the raw-source text that is embedded for one symbol.
 *
 * Emits a header line (kind, name, split identifier, file) followed by the
 * symbol's source lines. The excerpt runs from the symbol's first line to
 * `node.end_line` when that is set, otherwise it covers 15 lines; either
 * way it is clamped to the end of the file.
 *
 * @param {object} node - Symbol row; reads `kind`, `name`, 1-based `line`, and optional `end_line`.
 * @param {string} file - File path shown in the header.
 * @param {string[]} lines - File content split into lines.
 * @returns {string} Header and source excerpt separated by a newline.
 */
function buildSourceText(node, file, lines) {
  const from = Math.max(0, node.line - 1);

  // Fall back to a fixed 15-line window when the end of the symbol is unknown.
  let until = from + 15;
  if (node.end_line) {
    until = node.end_line;
  }
  until = Math.min(lines.length, until);

  const body = lines.slice(from, until).join('\n');
  const header = `${node.kind} ${node.name} (${splitIdentifier(node.name)}) in ${file}`;
  return `${header}\n${body}`;
}

/**
* Lazy-load @huggingface/transformers.
* This is an optional dependency — gives a clear error if not installed.
Expand Down Expand Up @@ -203,10 +314,14 @@ function initEmbeddingsSchema(db) {

/**
* Build embeddings for all functions/methods/classes in the graph.
* @param {string} rootDir - Project root directory
* @param {string} modelKey - Model identifier from MODELS registry
* @param {string} [customDbPath] - Override path to graph.db
* @param {object} [options] - Embedding options
* @param {string} [options.strategy='structured'] - 'structured' (graph-enriched) or 'source' (raw code)
*/
export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
// path already imported at top
// fs already imported at top
export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = {}) {
const strategy = options.strategy || 'structured';
const dbPath = customDbPath || findDbPath(null);

const db = new Database(dbPath);
Expand All @@ -221,7 +336,24 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
)
.all();

console.log(`Building embeddings for ${nodes.length} symbols...`);
console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`);

// Prepare graph-context queries for structured strategy
let calleesStmt, callersStmt;
if (strategy === 'structured') {
calleesStmt = db.prepare(`
SELECT DISTINCT n.name FROM edges e
JOIN nodes n ON e.target_id = n.id
WHERE e.source_id = ? AND e.kind = 'calls'
ORDER BY n.name
`);
callersStmt = db.prepare(`
SELECT DISTINCT n.name FROM edges e
JOIN nodes n ON e.source_id = n.id
WHERE e.target_id = ? AND e.kind = 'calls'
ORDER BY n.name
`);
}

const byFile = new Map();
for (const node of nodes) {
Expand All @@ -232,6 +364,9 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
const texts = [];
const nodeIds = [];
const previews = [];
const config = getModelConfig(modelKey);
const contextWindow = config.contextWindow;
let overflowCount = 0;

for (const [file, fileNodes] of byFile) {
const fullPath = path.join(rootDir, file);
Expand All @@ -244,20 +379,31 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
}

for (const node of fileNodes) {
const startLine = Math.max(0, node.line - 1);
const endLine = node.end_line
? Math.min(lines.length, node.end_line)
: Math.min(lines.length, startLine + 15);
const context = lines.slice(startLine, endLine).join('\n');

const readable = splitIdentifier(node.name);
const text = `${node.kind} ${node.name} (${readable}) in ${file}\n${context}`;
let text =
strategy === 'structured'
? buildStructuredText(node, file, lines, calleesStmt, callersStmt)
: buildSourceText(node, file, lines);

// Detect and handle context window overflow
const tokens = estimateTokens(text);
if (tokens > contextWindow) {
overflowCount++;
const maxChars = contextWindow * 4;
text = text.slice(0, maxChars);
}

texts.push(text);
nodeIds.push(node.id);
previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`);
}
}

if (overflowCount > 0) {
warn(
`${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`,
);
}

console.log(`Embedding ${texts.length} symbols...`);
const { vectors, dim } = await embed(texts, modelKey);

Expand All @@ -269,16 +415,19 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
for (let i = 0; i < vectors.length; i++) {
insert.run(nodeIds[i], Buffer.from(vectors[i].buffer), previews[i]);
}
const config = getModelConfig(modelKey);
insertMeta.run('model', config.name);
insertMeta.run('dim', String(dim));
insertMeta.run('count', String(vectors.length));
insertMeta.run('strategy', strategy);
insertMeta.run('built_at', new Date().toISOString());
if (overflowCount > 0) {
insertMeta.run('truncated_count', String(overflowCount));
}
});
insertAll();

console.log(
`\nStored ${vectors.length} embeddings (${dim}d, ${getModelConfig(modelKey).name}) in graph.db`,
`\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`,
);
db.close();
}
Expand Down
2 changes: 2 additions & 0 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ export {
buildEmbeddings,
cosineSim,
DEFAULT_MODEL,
EMBEDDING_STRATEGIES,
embed,
estimateTokens,
MODELS,
multiSearchData,
search,
Expand Down
Loading