Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions generated/DOGFOOD_REPORT_v2.2.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@

**Fix:** Treat `.` (or current dir equivalent) as `null`/no filter in `structureData()`.

> **FIXED** — `structureData()` now normalizes the directory argument and treats `"."` as null/no filter. (`src/structure.js`)

### 2. Stale embeddings after rebuild (Medium severity)

- After an incremental `build`, embedding `node_id`s become orphaned (e.g. old IDs in 3077-range, new IDs in 4335-range)
Expand All @@ -52,6 +54,8 @@

**Fix:** Either preserve node IDs across rebuilds, invalidate embeddings when node IDs change, or warn the user to re-run `embed`.

> **FIXED** — Build now invalidates embeddings alongside nodes. Full builds clear the embeddings table entirely. Incremental builds delete the embeddings for affected files before deleting their nodes (the order matters: the node IDs are needed to locate the embeddings). After the build, any remaining orphaned embeddings trigger a warning: `N embeddings are orphaned (nodes changed). Run "codegraph embed" to refresh.` (`src/builder.js`)

### 3. `embed` default model requires HuggingFace auth (Medium severity)

- `codegraph embed .` crashes with `Error: Unauthorized access to file` for the default `jina-code` model
Expand All @@ -61,6 +65,8 @@

**Fix:** Either default to a public model (e.g. `minilm`), auto-fallback to `minilm` on auth failure, or catch the error and provide a clear message with instructions.

> **FIXED** — Default model changed from `nomic-v1.5` (gated, requires HF_TOKEN) to `minilm` (public, 23MB, always works). Additionally, `loadModel()` now catches auth/download failures and prints a clear message with options (set HF_TOKEN or use `--model minilm`) instead of crashing with a raw stack trace. (`src/embedder.js`, `src/cli.js`)

### 4. Cross-language false positive in export (Low severity)

- One low-confidence (0.3) call edge: `main` (build.rs) → `setup` (tests/unit/structure.test.js)
Expand All @@ -69,6 +75,8 @@

**Fix:** Export commands could support a `--min-confidence` filter, or the default export could exclude edges below a threshold (e.g. 0.5).

> **FIXED** — Added `--min-confidence <score>` option to the `export` command (default: 0.5). All three formats (DOT, Mermaid, JSON) filter edges by confidence at the SQL level. The 0.3-confidence false positive is excluded by default. Users can pass `--min-confidence 0` to include all edges. (`src/export.js`, `src/cli.js`)

## `--no-tests` Flag

Tested on `stats` and `map` — both correctly filter out test files:
Expand All @@ -80,3 +88,5 @@ Tested on `stats` and `map` — both correctly filter out test files:
- `embed --model minilm` successfully generated 392 embeddings (384d)
- `search "build graph"` returned 15 results after fresh embeddings (top hit: 37.9% `test_triangle_cycle`)
- Search quality is reasonable but not ideal — `buildGraph` itself didn't appear in results for "build graph"

> **FIXED** — Embedding text now includes a readable split of the identifier name (e.g. `buildGraph` → `"function buildGraph (build Graph) in src/builder.js"`). This lets the model naturally associate "build graph" queries with `buildGraph` without needing hybrid search. camelCase, PascalCase, snake_case, and kebab-case are all handled. (`src/embedder.js`)
40 changes: 38 additions & 2 deletions src/builder.js
Original file line number Diff line number Diff line change
Expand Up @@ -402,13 +402,30 @@ export async function buildGraph(rootDir, opts = {}) {
return;
}

// Check if embeddings table exists (created by `embed`, not by initSchema)
let hasEmbeddings = false;
try {
db.prepare('SELECT 1 FROM embeddings LIMIT 1').get();
hasEmbeddings = true;
} catch {
/* table doesn't exist */
}

if (isFullBuild) {
const deletions =
'PRAGMA foreign_keys = OFF; DELETE FROM node_metrics; DELETE FROM edges; DELETE FROM nodes; PRAGMA foreign_keys = ON;';
db.exec(
'PRAGMA foreign_keys = OFF; DELETE FROM node_metrics; DELETE FROM edges; DELETE FROM nodes; PRAGMA foreign_keys = ON;',
hasEmbeddings
? `${deletions.replace('PRAGMA foreign_keys = ON;', '')} DELETE FROM embeddings; PRAGMA foreign_keys = ON;`
: deletions,
);
} else {
info(`Incremental: ${parseChanges.length} changed, ${removed.length} removed`);
// Remove metrics/edges/nodes for changed and removed files
// Remove embeddings/metrics/edges/nodes for changed and removed files
// Embeddings must be deleted BEFORE nodes (we need node IDs to find them)
const deleteEmbeddingsForFile = hasEmbeddings
? db.prepare('DELETE FROM embeddings WHERE node_id IN (SELECT id FROM nodes WHERE file = ?)')
: null;
const deleteNodesForFile = db.prepare('DELETE FROM nodes WHERE file = ?');
const deleteEdgesForFile = db.prepare(`
DELETE FROM edges WHERE source_id IN (SELECT id FROM nodes WHERE file = @f)
Expand All @@ -418,12 +435,14 @@ export async function buildGraph(rootDir, opts = {}) {
'DELETE FROM node_metrics WHERE node_id IN (SELECT id FROM nodes WHERE file = ?)',
);
for (const relPath of removed) {
deleteEmbeddingsForFile?.run(relPath);
deleteEdgesForFile.run({ f: relPath });
deleteMetricsForFile.run(relPath);
deleteNodesForFile.run(relPath);
}
for (const item of parseChanges) {
const relPath = item.relPath || normalizePath(path.relative(rootDir, item.file));
deleteEmbeddingsForFile?.run(relPath);
deleteEdgesForFile.run({ f: relPath });
deleteMetricsForFile.run(relPath);
deleteNodesForFile.run(relPath);
Expand Down Expand Up @@ -823,6 +842,23 @@ export async function buildGraph(rootDir, opts = {}) {
const nodeCount = db.prepare('SELECT COUNT(*) as c FROM nodes').get().c;
info(`Graph built: ${nodeCount} nodes, ${edgeCount} edges`);
info(`Stored in ${dbPath}`);

// Warn about orphaned embeddings that no longer match any node
if (hasEmbeddings) {
try {
const orphaned = db
.prepare('SELECT COUNT(*) as c FROM embeddings WHERE node_id NOT IN (SELECT id FROM nodes)')
.get().c;
if (orphaned > 0) {
warn(
`${orphaned} embeddings are orphaned (nodes changed). Run "codegraph embed" to refresh.`,
);
}
} catch {
/* ignore — embeddings table may have been dropped */
}
}

db.close();

// Write journal header after successful build
Expand Down
13 changes: 9 additions & 4 deletions src/cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -272,10 +272,15 @@ program
.option('--functions', 'Function-level graph instead of file-level')
.option('-T, --no-tests', 'Exclude test/spec files')
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
.option('--min-confidence <score>', 'Minimum edge confidence threshold (default: 0.5)', '0.5')
.option('-o, --output <file>', 'Write to file instead of stdout')
.action((opts) => {
const db = new Database(findDbPath(opts.db), { readonly: true });
const exportOpts = { fileLevel: !opts.functions, noTests: resolveNoTests(opts) };
const exportOpts = {
fileLevel: !opts.functions,
noTests: resolveNoTests(opts),
minConfidence: parseFloat(opts.minConfidence),
};

let output;
switch (opts.format) {
Expand Down Expand Up @@ -412,7 +417,7 @@ program
.action(() => {
console.log('\nAvailable embedding models:\n');
for (const [key, config] of Object.entries(MODELS)) {
const def = key === 'nomic-v1.5' ? ' (default)' : '';
const def = key === 'minilm' ? ' (default)' : '';
console.log(` ${key.padEnd(12)} ${String(config.dim).padStart(4)}d ${config.desc}${def}`);
}
console.log('\nUsage: codegraph embed --model <name>');
Expand All @@ -426,8 +431,8 @@ program
)
.option(
'-m, --model <name>',
'Embedding model: minilm, jina-small, jina-base, jina-code, nomic, nomic-v1.5 (default), bge-large. Run `codegraph models` for details',
'nomic-v1.5',
'Embedding model: minilm (default), jina-small, jina-base, jina-code, nomic, nomic-v1.5, bge-large. Run `codegraph models` for details',
'minilm',
)
.action(async (dir, opts) => {
const root = path.resolve(dir || '.');
Expand Down
40 changes: 36 additions & 4 deletions src/embedder.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ import Database from 'better-sqlite3';
import { findDbPath, openReadonlyOrFail } from './db.js';
import { warn } from './logger.js';

/**
 * Split an identifier into space-separated, human-readable words.
 *
 * A space is inserted:
 *   - between a lowercase letter or digit and a following uppercase letter
 *     (`buildGraph` → "build Graph", `sha256Hash` → "sha256 Hash");
 *   - between an acronym and a following capitalized word
 *     (`HTTPServer` → "HTTP Server");
 *   - in place of every run of `_` or `-`
 *     (`snake_case` → "snake case", `kebab-case` → "kebab case").
 *
 * Letter case is preserved; leading and trailing separators are trimmed.
 *
 * @param {string} name - Identifier to split. `null`/`undefined` are treated
 *   as an empty identifier instead of throwing.
 * @returns {string} The identifier's words joined by single spaces.
 */
function splitIdentifier(name) {
  return String(name ?? '')
    // Digits count as the end of a word so `parseV2Config` → "parse V2 Config".
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
    // Peel the last capital off an acronym when it starts the next word.
    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
    .replace(/[_-]+/g, ' ')
    .trim();
}

// Lazy-load transformers (heavy, optional module)
let pipeline = null;
let _cos_sim = null;
Expand Down Expand Up @@ -55,7 +67,7 @@ export const MODELS = {
},
};

export const DEFAULT_MODEL = 'nomic-v1.5';
export const DEFAULT_MODEL = 'minilm';
const BATCH_SIZE_MAP = {
minilm: 32,
'jina-small': 16,
Expand Down Expand Up @@ -103,8 +115,27 @@ async function loadModel(modelKey) {
_cos_sim = transformers.cos_sim;

console.log(`Loading embedding model: ${config.name} (${config.dim}d)...`);
const opts = config.quantized ? { quantized: true } : {};
extractor = await pipeline('feature-extraction', config.name, opts);
const pipelineOpts = config.quantized ? { quantized: true } : {};
try {
extractor = await pipeline('feature-extraction', config.name, pipelineOpts);
} catch (err) {
const msg = err.message || String(err);
if (msg.includes('Unauthorized') || msg.includes('401') || msg.includes('gated')) {
console.error(
`\nModel "${config.name}" requires authentication.\n` +
`This model is gated on HuggingFace and needs an access token.\n\n` +
`Options:\n` +
` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` +
` 2. Use a public model instead: codegraph embed --model minilm\n`,
);
} else {
console.error(
`\nFailed to load model "${config.name}": ${msg}\n` +
`Try a different model: codegraph embed --model minilm\n`,
);
}
process.exit(1);
}
activeModel = config.name;
console.log('Model loaded.');
return { extractor, config };
Expand Down Expand Up @@ -219,7 +250,8 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
: Math.min(lines.length, startLine + 15);
const context = lines.slice(startLine, endLine).join('\n');

const text = `${node.kind} ${node.name} in ${file}\n${context}`;
const readable = splitIdentifier(node.name);
const text = `${node.kind} ${node.name} (${readable}) in ${file}\n${context}`;
texts.push(text);
nodeIds.push(node.id);
previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`);
Expand Down
23 changes: 16 additions & 7 deletions src/export.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import path from 'node:path';
import { isTestFile } from './queries.js';

const DEFAULT_MIN_CONFIDENCE = 0.5;

/**
* Export the dependency graph in DOT (Graphviz) format.
*/
export function exportDOT(db, opts = {}) {
const fileLevel = opts.fileLevel !== false;
const noTests = opts.noTests || false;
const minConf = opts.minConfidence ?? DEFAULT_MIN_CONFIDENCE;
const lines = [
'digraph codegraph {',
' rankdir=LR;',
Expand All @@ -23,8 +26,9 @@ export function exportDOT(db, opts = {}) {
JOIN nodes n1 ON e.source_id = n1.id
JOIN nodes n2 ON e.target_id = n2.id
WHERE n1.file != n2.file AND e.kind IN ('imports', 'imports-type', 'calls')
AND e.confidence >= ?
`)
.all();
.all(minConf);
if (noTests) edges = edges.filter((e) => !isTestFile(e.source) && !isTestFile(e.target));

// Try to use directory nodes from DB (built by structure analysis)
Expand Down Expand Up @@ -102,8 +106,9 @@ export function exportDOT(db, opts = {}) {
JOIN nodes n2 ON e.target_id = n2.id
WHERE n1.kind IN ('function', 'method', 'class', 'interface', 'type', 'struct', 'enum', 'trait', 'record', 'module') AND n2.kind IN ('function', 'method', 'class', 'interface', 'type', 'struct', 'enum', 'trait', 'record', 'module')
AND e.kind = 'calls'
AND e.confidence >= ?
`)
.all();
.all(minConf);
if (noTests)
edges = edges.filter((e) => !isTestFile(e.source_file) && !isTestFile(e.target_file));

Expand All @@ -126,6 +131,7 @@ export function exportDOT(db, opts = {}) {
export function exportMermaid(db, opts = {}) {
const fileLevel = opts.fileLevel !== false;
const noTests = opts.noTests || false;
const minConf = opts.minConfidence ?? DEFAULT_MIN_CONFIDENCE;
const lines = ['graph LR'];

if (fileLevel) {
Expand All @@ -136,8 +142,9 @@ export function exportMermaid(db, opts = {}) {
JOIN nodes n1 ON e.source_id = n1.id
JOIN nodes n2 ON e.target_id = n2.id
WHERE n1.file != n2.file AND e.kind IN ('imports', 'imports-type', 'calls')
AND e.confidence >= ?
`)
.all();
.all(minConf);
if (noTests) edges = edges.filter((e) => !isTestFile(e.source) && !isTestFile(e.target));

for (const { source, target } of edges) {
Expand All @@ -155,8 +162,9 @@ export function exportMermaid(db, opts = {}) {
JOIN nodes n2 ON e.target_id = n2.id
WHERE n1.kind IN ('function', 'method', 'class', 'interface', 'type', 'struct', 'enum', 'trait', 'record', 'module') AND n2.kind IN ('function', 'method', 'class', 'interface', 'type', 'struct', 'enum', 'trait', 'record', 'module')
AND e.kind = 'calls'
AND e.confidence >= ?
`)
.all();
.all(minConf);
if (noTests)
edges = edges.filter((e) => !isTestFile(e.source_file) && !isTestFile(e.target_file));

Expand All @@ -175,6 +183,7 @@ export function exportMermaid(db, opts = {}) {
*/
export function exportJSON(db, opts = {}) {
const noTests = opts.noTests || false;
const minConf = opts.minConfidence ?? DEFAULT_MIN_CONFIDENCE;

let nodes = db
.prepare(`
Expand All @@ -185,13 +194,13 @@ export function exportJSON(db, opts = {}) {

let edges = db
.prepare(`
SELECT DISTINCT n1.file AS source, n2.file AS target, e.kind
SELECT DISTINCT n1.file AS source, n2.file AS target, e.kind, e.confidence
FROM edges e
JOIN nodes n1 ON e.source_id = n1.id
JOIN nodes n2 ON e.target_id = n2.id
WHERE n1.file != n2.file
WHERE n1.file != n2.file AND e.confidence >= ?
`)
.all();
.all(minConf);
if (noTests) edges = edges.filter((e) => !isTestFile(e.source) && !isTestFile(e.target));

return { nodes, edges };
Expand Down
3 changes: 2 additions & 1 deletion src/structure.js
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,8 @@ export function buildStructure(db, fileSymbols, _rootDir, lineCountMap, director
*/
export function structureData(customDbPath, opts = {}) {
const db = openReadonlyOrFail(customDbPath);
const filterDir = opts.directory || null;
const rawDir = opts.directory || null;
const filterDir = rawDir && normalizePath(rawDir) !== '.' ? rawDir : null;
const maxDepth = opts.depth || null;
const sortBy = opts.sort || 'files';
const noTests = opts.noTests || false;
Expand Down