diff --git a/generated/DOGFOOD_REPORT_v2.2.0.md b/generated/DOGFOOD_REPORT_v2.2.0.md index 5b8de79b1..507b10a48 100644 --- a/generated/DOGFOOD_REPORT_v2.2.0.md +++ b/generated/DOGFOOD_REPORT_v2.2.0.md @@ -42,6 +42,8 @@ **Fix:** Treat `.` (or current dir equivalent) as `null`/no filter in `structureData()`. +> **FIXED** — `structureData()` now normalizes the directory argument and treats `"."` as null/no filter. (`src/structure.js`) + ### 2. Stale embeddings after rebuild (Medium severity) - After an incremental `build`, embedding `node_id`s become orphaned (e.g. old IDs in 3077-range, new IDs in 4335-range) @@ -52,6 +54,8 @@ **Fix:** Either preserve node IDs across rebuilds, invalidate embeddings when node IDs change, or warn the user to re-run `embed`. +> **FIXED** — Build now invalidates embeddings alongside nodes. Full builds clear the embeddings table entirely. Incremental builds delete embeddings for affected files before deleting their nodes (order matters — need node IDs to find them). After the build, any remaining orphaned embeddings trigger a warning: `"N embeddings are orphaned (nodes changed). Run codegraph embed to refresh."` (`src/builder.js`) + ### 3. `embed` default model requires HuggingFace auth (Medium severity) - `codegraph embed .` crashes with `Error: Unauthorized access to file` for the default `jina-code` model @@ -61,6 +65,8 @@ **Fix:** Either default to a public model (e.g. `minilm`), auto-fallback to `minilm` on auth failure, or catch the error and provide a clear message with instructions. +> **FIXED** — Default model changed from `nomic-v1.5` (gated, requires HF_TOKEN) to `minilm` (public, 23MB, always works). Additionally, `loadModel()` now catches auth/download failures and prints a clear message with options (set HF_TOKEN or use `--model minilm`) instead of crashing with a raw stack trace. (`src/embedder.js`, `src/cli.js`) + ### 4. 
Cross-language false positive in export (Low severity) - One low-confidence (0.3) call edge: `main` (build.rs) → `setup` (tests/unit/structure.test.js) @@ -69,6 +75,8 @@ **Fix:** Export commands could support a `--min-confidence` filter, or the default export could exclude edges below a threshold (e.g. 0.5). +> **FIXED** — Added `--min-confidence <score>` option to the `export` command (default: 0.5). All three formats (DOT, Mermaid, JSON) filter edges by confidence at the SQL level. The 0.3-confidence false positive is excluded by default. Users can pass `--min-confidence 0` to include all edges. (`src/export.js`, `src/cli.js`) + ## `--no-tests` Flag Tested on `stats` and `map` — both correctly filter out test files: @@ -80,3 +88,5 @@ Tested on `stats` and `map` — both correctly filter out test files: - `embed --model minilm` successfully generated 392 embeddings (384d) - `search "build graph"` returned 15 results after fresh embeddings (top hit: 37.9% `test_triangle_cycle`) - Search quality is reasonable but not ideal — `buildGraph` itself didn't appear in results for "build graph" + +> **FIXED** — Embedding text now includes a readable split of the identifier name (e.g. `buildGraph` → `"function buildGraph (build Graph) in src/builder.js"`). This lets the model naturally associate "build graph" queries with `buildGraph` without needing hybrid search. camelCase, PascalCase, snake_case, and kebab-case are all handled. 
(`src/embedder.js`) diff --git a/src/builder.js b/src/builder.js index 0fe449906..d48d9b8a9 100644 --- a/src/builder.js +++ b/src/builder.js @@ -402,13 +402,30 @@ export async function buildGraph(rootDir, opts = {}) { return; } + // Check if embeddings table exists (created by `embed`, not by initSchema) + let hasEmbeddings = false; + try { + db.prepare('SELECT 1 FROM embeddings LIMIT 1').get(); + hasEmbeddings = true; + } catch { + /* table doesn't exist */ + } + if (isFullBuild) { + const deletions = + 'PRAGMA foreign_keys = OFF; DELETE FROM node_metrics; DELETE FROM edges; DELETE FROM nodes; PRAGMA foreign_keys = ON;'; db.exec( - 'PRAGMA foreign_keys = OFF; DELETE FROM node_metrics; DELETE FROM edges; DELETE FROM nodes; PRAGMA foreign_keys = ON;', + hasEmbeddings + ? `${deletions.replace('PRAGMA foreign_keys = ON;', '')} DELETE FROM embeddings; PRAGMA foreign_keys = ON;` + : deletions, ); } else { info(`Incremental: ${parseChanges.length} changed, ${removed.length} removed`); - // Remove metrics/edges/nodes for changed and removed files + // Remove embeddings/metrics/edges/nodes for changed and removed files + // Embeddings must be deleted BEFORE nodes (we need node IDs to find them) + const deleteEmbeddingsForFile = hasEmbeddings + ? 
db.prepare('DELETE FROM embeddings WHERE node_id IN (SELECT id FROM nodes WHERE file = ?)') + : null; const deleteNodesForFile = db.prepare('DELETE FROM nodes WHERE file = ?'); const deleteEdgesForFile = db.prepare(` DELETE FROM edges WHERE source_id IN (SELECT id FROM nodes WHERE file = @f) @@ -418,12 +435,14 @@ export async function buildGraph(rootDir, opts = {}) { 'DELETE FROM node_metrics WHERE node_id IN (SELECT id FROM nodes WHERE file = ?)', ); for (const relPath of removed) { + deleteEmbeddingsForFile?.run(relPath); deleteEdgesForFile.run({ f: relPath }); deleteMetricsForFile.run(relPath); deleteNodesForFile.run(relPath); } for (const item of parseChanges) { const relPath = item.relPath || normalizePath(path.relative(rootDir, item.file)); + deleteEmbeddingsForFile?.run(relPath); deleteEdgesForFile.run({ f: relPath }); deleteMetricsForFile.run(relPath); deleteNodesForFile.run(relPath); @@ -823,6 +842,23 @@ export async function buildGraph(rootDir, opts = {}) { const nodeCount = db.prepare('SELECT COUNT(*) as c FROM nodes').get().c; info(`Graph built: ${nodeCount} nodes, ${edgeCount} edges`); info(`Stored in ${dbPath}`); + + // Warn about orphaned embeddings that no longer match any node + if (hasEmbeddings) { + try { + const orphaned = db + .prepare('SELECT COUNT(*) as c FROM embeddings WHERE node_id NOT IN (SELECT id FROM nodes)') + .get().c; + if (orphaned > 0) { + warn( + `${orphaned} embeddings are orphaned (nodes changed). 
Run "codegraph embed" to refresh.`, + ); + } + } catch { + /* ignore — embeddings table may have been dropped */ + } + } + db.close(); // Write journal header after successful build diff --git a/src/cli.js b/src/cli.js index 416423fc9..00c5431b5 100644 --- a/src/cli.js +++ b/src/cli.js @@ -272,10 +272,15 @@ program .option('--functions', 'Function-level graph instead of file-level') .option('-T, --no-tests', 'Exclude test/spec files') .option('--include-tests', 'Include test/spec files (overrides excludeTests config)') + .option('--min-confidence ', 'Minimum edge confidence threshold (default: 0.5)', '0.5') .option('-o, --output ', 'Write to file instead of stdout') .action((opts) => { const db = new Database(findDbPath(opts.db), { readonly: true }); - const exportOpts = { fileLevel: !opts.functions, noTests: resolveNoTests(opts) }; + const exportOpts = { + fileLevel: !opts.functions, + noTests: resolveNoTests(opts), + minConfidence: parseFloat(opts.minConfidence), + }; let output; switch (opts.format) { @@ -412,7 +417,7 @@ program .action(() => { console.log('\nAvailable embedding models:\n'); for (const [key, config] of Object.entries(MODELS)) { - const def = key === 'nomic-v1.5' ? ' (default)' : ''; + const def = key === 'minilm' ? ' (default)' : ''; console.log(` ${key.padEnd(12)} ${String(config.dim).padStart(4)}d ${config.desc}${def}`); } console.log('\nUsage: codegraph embed --model '); @@ -426,8 +431,8 @@ program ) .option( '-m, --model ', - 'Embedding model: minilm, jina-small, jina-base, jina-code, nomic, nomic-v1.5 (default), bge-large. Run `codegraph models` for details', - 'nomic-v1.5', + 'Embedding model: minilm (default), jina-small, jina-base, jina-code, nomic, nomic-v1.5, bge-large. 
Run `codegraph models` for details', + 'minilm', ) .action(async (dir, opts) => { const root = path.resolve(dir || '.'); diff --git a/src/embedder.js b/src/embedder.js index 1a813d2a0..6876a00e4 100644 --- a/src/embedder.js +++ b/src/embedder.js @@ -4,6 +4,18 @@ import Database from 'better-sqlite3'; import { findDbPath, openReadonlyOrFail } from './db.js'; import { warn } from './logger.js'; +/** + * Split an identifier into readable words. + * camelCase/PascalCase → "camel Case", snake_case → "snake case", kebab-case → "kebab case" + */ +function splitIdentifier(name) { + return name + .replace(/([a-z])([A-Z])/g, '$1 $2') + .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2') + .replace(/[_-]+/g, ' ') + .trim(); +} + // Lazy-load transformers (heavy, optional module) let pipeline = null; let _cos_sim = null; @@ -55,7 +67,7 @@ export const MODELS = { }, }; -export const DEFAULT_MODEL = 'nomic-v1.5'; +export const DEFAULT_MODEL = 'minilm'; const BATCH_SIZE_MAP = { minilm: 32, 'jina-small': 16, @@ -103,8 +115,27 @@ async function loadModel(modelKey) { _cos_sim = transformers.cos_sim; console.log(`Loading embedding model: ${config.name} (${config.dim}d)...`); - const opts = config.quantized ? { quantized: true } : {}; - extractor = await pipeline('feature-extraction', config.name, opts); + const pipelineOpts = config.quantized ? { quantized: true } : {}; + try { + extractor = await pipeline('feature-extraction', config.name, pipelineOpts); + } catch (err) { + const msg = err.message || String(err); + if (msg.includes('Unauthorized') || msg.includes('401') || msg.includes('gated')) { + console.error( + `\nModel "${config.name}" requires authentication.\n` + + `This model is gated on HuggingFace and needs an access token.\n\n` + + `Options:\n` + + ` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` + + ` 2. 
Use a public model instead: codegraph embed --model minilm\n`, + ); + } else { + console.error( + `\nFailed to load model "${config.name}": ${msg}\n` + + `Try a different model: codegraph embed --model minilm\n`, + ); + } + process.exit(1); + } activeModel = config.name; console.log('Model loaded.'); return { extractor, config }; @@ -219,7 +250,8 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) { : Math.min(lines.length, startLine + 15); const context = lines.slice(startLine, endLine).join('\n'); - const text = `${node.kind} ${node.name} in ${file}\n${context}`; + const readable = splitIdentifier(node.name); + const text = `${node.kind} ${node.name} (${readable}) in ${file}\n${context}`; texts.push(text); nodeIds.push(node.id); previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`); diff --git a/src/export.js b/src/export.js index 2e8ab5ee5..55433ca06 100644 --- a/src/export.js +++ b/src/export.js @@ -1,12 +1,15 @@ import path from 'node:path'; import { isTestFile } from './queries.js'; +const DEFAULT_MIN_CONFIDENCE = 0.5; + /** * Export the dependency graph in DOT (Graphviz) format. */ export function exportDOT(db, opts = {}) { const fileLevel = opts.fileLevel !== false; const noTests = opts.noTests || false; + const minConf = opts.minConfidence ?? DEFAULT_MIN_CONFIDENCE; const lines = [ 'digraph codegraph {', ' rankdir=LR;', @@ -23,8 +26,9 @@ export function exportDOT(db, opts = {}) { JOIN nodes n1 ON e.source_id = n1.id JOIN nodes n2 ON e.target_id = n2.id WHERE n1.file != n2.file AND e.kind IN ('imports', 'imports-type', 'calls') + AND e.confidence >= ? 
`) - .all(); + .all(minConf); if (noTests) edges = edges.filter((e) => !isTestFile(e.source) && !isTestFile(e.target)); // Try to use directory nodes from DB (built by structure analysis) @@ -102,8 +106,9 @@ export function exportDOT(db, opts = {}) { JOIN nodes n2 ON e.target_id = n2.id WHERE n1.kind IN ('function', 'method', 'class', 'interface', 'type', 'struct', 'enum', 'trait', 'record', 'module') AND n2.kind IN ('function', 'method', 'class', 'interface', 'type', 'struct', 'enum', 'trait', 'record', 'module') AND e.kind = 'calls' + AND e.confidence >= ? `) - .all(); + .all(minConf); if (noTests) edges = edges.filter((e) => !isTestFile(e.source_file) && !isTestFile(e.target_file)); @@ -126,6 +131,7 @@ export function exportDOT(db, opts = {}) { export function exportMermaid(db, opts = {}) { const fileLevel = opts.fileLevel !== false; const noTests = opts.noTests || false; + const minConf = opts.minConfidence ?? DEFAULT_MIN_CONFIDENCE; const lines = ['graph LR']; if (fileLevel) { @@ -136,8 +142,9 @@ export function exportMermaid(db, opts = {}) { JOIN nodes n1 ON e.source_id = n1.id JOIN nodes n2 ON e.target_id = n2.id WHERE n1.file != n2.file AND e.kind IN ('imports', 'imports-type', 'calls') + AND e.confidence >= ? `) - .all(); + .all(minConf); if (noTests) edges = edges.filter((e) => !isTestFile(e.source) && !isTestFile(e.target)); for (const { source, target } of edges) { @@ -155,8 +162,9 @@ export function exportMermaid(db, opts = {}) { JOIN nodes n2 ON e.target_id = n2.id WHERE n1.kind IN ('function', 'method', 'class', 'interface', 'type', 'struct', 'enum', 'trait', 'record', 'module') AND n2.kind IN ('function', 'method', 'class', 'interface', 'type', 'struct', 'enum', 'trait', 'record', 'module') AND e.kind = 'calls' + AND e.confidence >= ? 
`) - .all(); + .all(minConf); if (noTests) edges = edges.filter((e) => !isTestFile(e.source_file) && !isTestFile(e.target_file)); @@ -175,6 +183,7 @@ export function exportMermaid(db, opts = {}) { */ export function exportJSON(db, opts = {}) { const noTests = opts.noTests || false; + const minConf = opts.minConfidence ?? DEFAULT_MIN_CONFIDENCE; let nodes = db .prepare(` @@ -185,13 +194,13 @@ export function exportJSON(db, opts = {}) { let edges = db .prepare(` - SELECT DISTINCT n1.file AS source, n2.file AS target, e.kind + SELECT DISTINCT n1.file AS source, n2.file AS target, e.kind, e.confidence FROM edges e JOIN nodes n1 ON e.source_id = n1.id JOIN nodes n2 ON e.target_id = n2.id - WHERE n1.file != n2.file + WHERE n1.file != n2.file AND e.confidence >= ? `) - .all(); + .all(minConf); if (noTests) edges = edges.filter((e) => !isTestFile(e.source) && !isTestFile(e.target)); return { nodes, edges }; diff --git a/src/structure.js b/src/structure.js index 5f45feb7e..ba348f374 100644 --- a/src/structure.js +++ b/src/structure.js @@ -231,7 +231,8 @@ export function buildStructure(db, fileSymbols, _rootDir, lineCountMap, director */ export function structureData(customDbPath, opts = {}) { const db = openReadonlyOrFail(customDbPath); - const filterDir = opts.directory || null; + const rawDir = opts.directory || null; + const filterDir = rawDir && normalizePath(rawDir) !== '.' ? rawDir : null; const maxDepth = opts.depth || null; const sortBy = opts.sort || 'files'; const noTests = opts.noTests || false;