Skip to content

Commit b065042

Browse files
committed
feat(03-03): add multi-codebase eval runner command
- add npm eval script that builds before running dist-based harness imports
- upgrade eval runner for dual codebases, fixture-a/fixture-b overrides, and combined summary output
1 parent 5c5319b commit b065042

File tree

2 files changed

+179
-127
lines changed

2 files changed

+179
-127
lines changed

package.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,8 @@
113113
"format": "prettier --write \"src/**/*.ts\"",
114114
"format:check": "prettier --check \"src/**/*.ts\"",
115115
"type-check": "tsc --noEmit",
116-
"mcp:inspect": "npx -y @modelcontextprotocol/inspector node dist/index.js ."
116+
"mcp:inspect": "npx -y @modelcontextprotocol/inspector node dist/index.js .",
117+
"eval": "pnpm run build && node scripts/run-eval.mjs"
117118
},
118119
"dependencies": {
119120
"@huggingface/transformers": "^3.8.1",
@@ -124,8 +125,10 @@
124125
"glob": "^10.3.10",
125126
"hono": "4.11.7",
126127
"ignore": "^5.3.1",
128+
"tree-sitter-wasms": "^0.1.13",
127129
"typescript": "^5.3.3",
128130
"uuid": "^9.0.1",
131+
"web-tree-sitter": "^0.25.10",
129132
"zod": "^4.3.4"
130133
},
131134
"devDependencies": {
@@ -155,4 +158,4 @@
155158
"@modelcontextprotocol/sdk>ajv": "8.18.0"
156159
}
157160
}
158-
}
161+
}

scripts/run-eval.mjs

Lines changed: 174 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -1,168 +1,217 @@
11
#!/usr/bin/env node
2-
/**
3-
* Search quality evaluation runner (single canonical script).
4-
*
5-
* Re-indexes a target codebase with the current model+chunking settings
6-
* and runs the eval harness from tests/fixtures/eval-angular-spotify.json.
7-
* Paths in output are redacted by default for publishable logs; use
8-
* --no-redact for full paths (e.g. internal runs).
9-
*
10-
* Usage: node scripts/run-eval.mjs <path-to-codebase> [--skip-reindex] [--no-rerank] [--no-redact]
11-
*/
122

133
import path from 'path';
14-
import crypto from 'crypto';
15-
import { readFileSync } from 'fs';
164
import { fileURLToPath } from 'url';
5+
import { parseArgs } from 'util';
6+
import { readFileSync } from 'fs';
7+
import { existsSync } from 'fs';
178
import { CodebaseIndexer } from '../dist/core/indexer.js';
189
import { CodebaseSearcher } from '../dist/core/search.js';
1910
import { analyzerRegistry } from '../dist/core/analyzer-registry.js';
2011
import { AngularAnalyzer } from '../dist/analyzers/angular/index.js';
2112
import { GenericAnalyzer } from '../dist/analyzers/generic/index.js';
13+
import { evaluateFixture, formatEvalReport } from '../dist/eval/harness.js';
2214

2315
const __dirname = path.dirname(fileURLToPath(import.meta.url));
24-
const fixtureArg = process.argv.find(arg => arg.startsWith('--fixture='));
25-
const fixturePath = fixtureArg
26-
? path.resolve(fixtureArg.split('=')[1])
27-
: path.join(__dirname, '..', 'tests', 'fixtures', 'eval-angular-spotify.json');
28-
const evalFixture = JSON.parse(readFileSync(fixturePath, 'utf-8'));
16+
// Repository root: one directory above the folder that contains this script.
const projectRoot = path.join(__dirname, '..');
const packageJsonPath = path.join(projectRoot, 'package.json');

// package.json is parsed at startup; its `version` is printed in the run header.
const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf-8'));

// Fixtures used when --fixture-a / --fixture-b are not given on the command line.
const defaultFixtureA = path.join(projectRoot, 'tests', 'fixtures', 'eval-angular-spotify.json');
const defaultFixtureB = path.join(projectRoot, 'tests', 'fixtures', 'eval-controlled.json');

// Help text: printed to stdout for --help and to stderr when the number of
// positional arguments is wrong.
const usage = [
  `Usage: node scripts/run-eval.mjs <codebaseA> [codebaseB] [options]`,
  ``,
  `Options:`,
  ` --fixture-a=<path> Override fixture for codebaseA`,
  ` --fixture-b=<path> Override fixture for codebaseB`,
  ` --skip-reindex Skip re-index phase`,
  ` --no-rerank Disable ambiguity reranker`,
  ` --no-redact Show full file paths in report`,
  ` --help Show this help and exit`
].join('\n');
2935

30-
// Register analyzers
3136
analyzerRegistry.register(new AngularAnalyzer());
3237
analyzerRegistry.register(new GenericAnalyzer());
3338

34-
function isTestFile(filePath) {
35-
const n = filePath.toLowerCase().replace(/\\/g, '/');
36-
return n.includes('.spec.') || n.includes('.test.') || n.includes('/e2e/') ||
37-
n.includes('/__tests__/');
38-
}
39-
40-
function matchesPattern(filePath, patterns) {
41-
const n = filePath.toLowerCase().replace(/\\/g, '/');
42-
return patterns.some(p => n.includes(p.toLowerCase()));
39+
/**
 * Read an eval fixture from disk and parse it as JSON.
 *
 * @param {string} fixturePath - Path of the fixture file to read (UTF-8).
 * @returns {*} The parsed JSON value.
 * @throws {Error} Whatever `readFileSync` throws when the file cannot be read.
 * @throws {SyntaxError} When the content is not valid JSON. The message names
 *   the fixture file and the original parse error is attached as `cause`.
 */
function loadFixture(fixturePath) {
  const raw = readFileSync(fixturePath, 'utf-8');
  try {
    return JSON.parse(raw);
  } catch (error) {
    // A bare JSON.parse message does not say which file was malformed, which
    // matters because a single run can load two different fixtures.
    throw new SyntaxError(`Invalid JSON in fixture ${fixturePath}: ${error.message}`, {
      cause: error
    });
  }
}
4443

45-
function hashPath(filePath) {
46-
return crypto.createHash('sha1').update(filePath.toLowerCase()).digest('hex').slice(0, 8);
44+
/**
 * Print the run banner: the tool version and the embedding model in use.
 *
 * @param {string} version - Version string interpolated into the banner.
 * @returns {void}
 */
function printHeader(version) {
  // An unset or empty EMBEDDING_MODEL falls back to the default label.
  const modelName = process.env.EMBEDDING_MODEL || 'Xenova/bge-small-en-v1.5 (default)';
  const bannerLines = [`\n=== codebase-context v${version} eval ===`, `Model: ${modelName}`];

  for (const line of bannerLines) {
    console.log(line);
  }
}
4848

49-
function formatPath(filePath, redactPaths) {
50-
if (!filePath) return 'none';
51-
const normalized = filePath.replace(/\\/g, '/');
52-
if (!redactPaths) return normalized;
53-
const base = normalized.split('/').pop() || normalized;
54-
return `path#${hashPath(normalized)}/${base}`;
49+
/**
 * Report whether a codebase already has index output on disk.
 *
 * Both `<rootPath>/.codebase-context/index.json` and
 * `<rootPath>/.codebase-context/index` must exist. Only existence is checked;
 * the contents are not inspected.
 *
 * @param {string} rootPath - Root directory of the codebase.
 * @returns {boolean} True when both entries exist.
 */
function hasIndexArtifacts(rootPath) {
  const contextDir = path.join(rootPath, '.codebase-context');
  // Order matters only for short-circuiting: index.json is probed first.
  const requiredEntries = ['index.json', 'index'];
  return requiredEntries.every((entry) => existsSync(path.join(contextDir, entry)));
}
5655

57-
async function main() {
58-
const rootPath = process.argv[2];
59-
if (!rootPath) {
60-
console.error('Usage: node scripts/run-eval.mjs <path-to-codebase> [--skip-reindex] [--no-rerank] [--no-redact]');
61-
process.exit(1);
56+
/**
 * Build the index for a codebase, unless the caller asked to skip it and index
 * artifacts are already on disk.
 *
 * @param {string} rootPath - Codebase root passed to `CodebaseIndexer`.
 * @param {boolean} skipReindex - True when --skip-reindex was requested.
 * @returns {Promise<void>} Resolves once indexing finished or was skipped.
 */
async function maybeReindex(rootPath, skipReindex) {
  if (skipReindex) {
    if (hasIndexArtifacts(rootPath)) {
      console.log(`\n--- Phase 1: Skipping re-index (--skip-reindex) ---`);
      return;
    }

    // Skipping was requested but there is nothing to reuse, so index anyway.
    console.log(
      `\n--- Phase 1: --skip-reindex requested but no index artifacts found; running index build ---`
    );
  }

  console.log(`\n--- Phase 1: Re-indexing ---`);

  // Only the embedding and completion phases are reported; "\r" rewrites the
  // same stderr line instead of scrolling.
  const reportProgress = ({ phase, percentage, filesProcessed, totalFiles }) => {
    if (phase !== 'embedding' && phase !== 'complete') {
      return;
    }
    process.stderr.write(`\r[${phase}] ${percentage}% (${filesProcessed}/${totalFiles} files)`);
  };

  const indexer = new CodebaseIndexer({ rootPath, onProgress: reportProgress });
  const stats = await indexer.index();

  console.log(
    `\nIndexing complete: ${stats.indexedFiles} files, ${stats.totalChunks} chunks in ${stats.duration}ms`
  );
}
85+
86+
/**
 * Run the eval harness for one codebase/fixture pair and print its report.
 *
 * @param {object} options
 * @param {string} options.label - Short name shown in the output (e.g. "A").
 * @param {string} options.codebasePath - Codebase root; resolved to an absolute path.
 * @param {string} options.fixturePath - Fixture JSON path; resolved to an absolute path.
 * @param {boolean} options.skipReindex - Forwarded to `maybeReindex`.
 * @param {boolean} options.noRerank - When true the reranker is disabled for searches.
 * @param {boolean} options.redactPaths - When true, paths are hidden in the output.
 * @returns {Promise<object>} The summary returned by `evaluateFixture`.
 * @throws {Error} When the fixture has no `queries` array.
 */
async function runSingleEvaluation({
  label,
  codebasePath,
  fixturePath,
  skipReindex,
  noRerank,
  redactPaths
}) {
  const resolvedCodebase = path.resolve(codebasePath);
  const resolvedFixture = path.resolve(fixturePath);
  const fixture = loadFixture(resolvedFixture);

  // Fail with a clear message instead of a TypeError on `fixture.queries.length`.
  if (!Array.isArray(fixture?.queries)) {
    throw new Error(`Fixture ${resolvedFixture} does not contain a "queries" array`);
  }

  // Honour redaction in the header as well as in the report: absolute paths
  // reveal local directory layout. The codebase location is hidden entirely;
  // for the fixture only the file name is kept so runs remain distinguishable.
  const shownCodebase = redactPaths ? '<redacted>' : resolvedCodebase;
  const shownFixture = redactPaths
    ? `<redacted>/${path.basename(resolvedFixture)}`
    : resolvedFixture;

  console.log(`\n=== Codebase: ${label} ===`);
  console.log(`Target: ${shownCodebase}`);
  console.log(`Fixture: ${shownFixture}`);
  console.log(
    `Reranker: ${noRerank ? 'DISABLED' : 'enabled (ambiguity-triggered, Xenova/ms-marco-MiniLM-L-6-v2)'}`
  );
  console.log(`Path output: ${redactPaths ? 'REDACTED' : 'FULL'}`);

  await maybeReindex(resolvedCodebase, skipReindex);

  console.log(`\n--- Phase 2: Running ${fixture.queries.length}-query eval harness ---`);
  const searcher = new CodebaseSearcher(resolvedCodebase);
  const summary = await evaluateFixture({
    fixture,
    searcher,
    limit: 5,
    searchOptions: {
      enableReranker: !noRerank
    }
  });

  // The report formatter receives the unredacted fixture path together with
  // the redaction flag, exactly as before.
  const report = formatEvalReport({
    codebaseLabel: label,
    fixturePath: resolvedFixture,
    summary,
    redactPaths
  });

  console.log(report);
  return summary;
}
122129

123-
const specCount = top3Files.filter(f => isTestFile(f)).length;
124-
const contaminated = specCount >= 2;
130+
/**
 * Print aggregate metrics across several per-codebase eval summaries.
 *
 * @param {Array<{total: number, top1Correct: number, top3RecallCount: number,
 *   specContaminatedCount: number}>} summaries - One summary per evaluated codebase.
 * @returns {void}
 */
function printCombinedSummary(summaries) {
  const totals = { total: 0, top1Correct: 0, top3RecallCount: 0, specContaminatedCount: 0 };
  const counters = Object.keys(totals);

  // Single pass over the summaries instead of one reduce per metric.
  for (const summary of summaries) {
    for (const counter of counters) {
      totals[counter] += summary[counter];
    }
  }

  // Clamp the denominator so an empty run prints 0% rather than NaN%.
  const denominator = Math.max(totals.total, 1);
  const formatRatio = (count) =>
    `${count}/${totals.total} (${((count / denominator) * 100).toFixed(0)}%)`;

  console.log(`\n=== Combined Summary ===`);
  console.log(`Top-1 Accuracy: ${formatRatio(totals.top1Correct)}`);
  console.log(`Top-3 Recall: ${formatRatio(totals.top3RecallCount)}`);
  console.log(`Spec Contamination: ${formatRatio(totals.specContaminatedCount)}`);
  console.log(`========================\n`);
}
125151

126-
if (top1Ok) top1Correct++;
127-
if (top3Ok) top3RecallCount++;
128-
if (contaminated) specContaminatedCount++;
152+
/**
 * CLI entry point: parse arguments, evaluate one or two codebases, and exit
 * with a status that reflects the result.
 *
 * Exit codes set here: 0 when every evaluated codebase passes its gate (and
 * for --help), 1 for usage errors or when any gate fails.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let parsed;
  try {
    parsed = parseArgs({
      options: {
        help: { type: 'boolean', default: false },
        'skip-reindex': { type: 'boolean', default: false },
        'no-rerank': { type: 'boolean', default: false },
        'no-redact': { type: 'boolean', default: false },
        'fixture-a': { type: 'string' },
        'fixture-b': { type: 'string' }
      },
      allowPositionals: true
    });
  } catch (error) {
    // parseArgs is strict by default and throws on unknown flags or bad option
    // values. Treat that as a usage error, not as a fatal crash.
    if (typeof error?.code !== 'string' || !error.code.startsWith('ERR_PARSE_ARGS_')) {
      throw error;
    }
    console.error(error.message);
    console.error(usage);
    process.exit(1);
  }

  const { values, positionals } = parsed;

  if (values.help) {
    console.log(usage);
    process.exit(0);
  }

  if (positionals.length < 1 || positionals.length > 2) {
    console.error(usage);
    process.exit(1);
  }

  printHeader(packageJson.version);

  const [codebaseA, codebaseB] = positionals;
  const fixtureA = values['fixture-a'] ? path.resolve(values['fixture-a']) : defaultFixtureA;
  const fixtureB = values['fixture-b'] ? path.resolve(values['fixture-b']) : defaultFixtureB;

  if (!codebaseB && values['fixture-b']) {
    // Without a second codebase the override would otherwise be dropped silently.
    console.warn('Warning: --fixture-b was given without codebaseB and will be ignored.');
  }

  const sharedOptions = {
    skipReindex: values['skip-reindex'],
    noRerank: values['no-rerank'],
    redactPaths: !values['no-redact']
  };

  const summaryA = await runSingleEvaluation({
    label: 'A',
    codebasePath: codebaseA,
    fixturePath: fixtureA,
    ...sharedOptions
  });

  const summaries = [summaryA];
  let passesAllGates = summaryA.passesGate;

  if (codebaseB) {
    const summaryB = await runSingleEvaluation({
      label: 'B',
      codebasePath: codebaseB,
      fixturePath: fixtureB,
      ...sharedOptions
    });

    summaries.push(summaryB);
    passesAllGates = passesAllGates && summaryB.passesGate;
    // The combined view is only printed when there are two runs to combine.
    printCombinedSummary(summaries);
  }

  process.exit(passesAllGates ? 0 : 1);
}
164213

165-
main().catch((err) => {
166-
console.error('Fatal error:', err);
214+
// Last-resort handler for anything that rejects out of main(). Exit code 2
// keeps unexpected failures distinct from the 0/1 codes main() sets itself.
const reportFatalError = (error) => {
  console.error('Fatal error:', error);
  process.exit(2);
};

main().catch(reportFatalError);

0 commit comments

Comments
 (0)