|
1 | 1 | #!/usr/bin/env node |
2 | | -/** |
3 | | - * Search quality evaluation runner (single canonical script). |
4 | | - * |
5 | | - * Re-indexes a target codebase with the current model+chunking settings |
6 | | - * and runs the eval harness from tests/fixtures/eval-angular-spotify.json. |
7 | | - * Paths in output are redacted by default for publishable logs; use |
8 | | - * --no-redact for full paths (e.g. internal runs). |
9 | | - * |
10 | | - * Usage: node scripts/run-eval.mjs <path-to-codebase> [--skip-reindex] [--no-rerank] [--no-redact] |
11 | | - */ |
12 | 2 |
|
import path from 'path';
import crypto from 'crypto';
import { fileURLToPath } from 'url';
import { parseArgs } from 'util';
import { readFileSync } from 'fs';
import { existsSync } from 'fs';
import { CodebaseIndexer } from '../dist/core/indexer.js';
import { CodebaseSearcher } from '../dist/core/search.js';
import { analyzerRegistry } from '../dist/core/analyzer-registry.js';
import { AngularAnalyzer } from '../dist/analyzers/angular/index.js';
import { GenericAnalyzer } from '../dist/analyzers/generic/index.js';
import { evaluateFixture, formatEvalReport } from '../dist/eval/harness.js';
22 | 14 |
|
// Directory containing this script; every other path below is derived from it.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// The project root is the parent of this script's directory.
const projectRoot = path.join(__dirname, '..');

/** Join path segments onto the project root. */
const fromRoot = (...segments) => path.join(projectRoot, ...segments);

const packageJsonPath = fromRoot('package.json');

// Parsed once at load time; main() reads `version` from it for the run banner.
const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf-8'));

// Fixtures used when --fixture-a / --fixture-b are not given.
const defaultFixtureA = fromRoot('tests', 'fixtures', 'eval-angular-spotify.json');
const defaultFixtureB = fromRoot('tests', 'fixtures', 'eval-controlled.json');

// One entry per supported flag, in the order they are printed.
const usageOptions = [
  ` --fixture-a=<path> Override fixture for codebaseA`,
  ` --fixture-b=<path> Override fixture for codebaseB`,
  ` --skip-reindex Skip re-index phase`,
  ` --no-rerank Disable ambiguity reranker`,
  ` --no-redact Show full file paths in report`,
  ` --help Show this help and exit`
];

// Help text printed for --help and for a wrong number of positional arguments.
const usage = [
  `Usage: node scripts/run-eval.mjs <codebaseA> [codebaseB] [options]`,
  ``,
  `Options:`,
  ...usageOptions
].join('\n');
29 | 35 |
|
30 | | -// Register analyzers |
// Register the Angular and generic analyzers with the shared registry at module load,
// before main() builds an indexer or searcher.
// NOTE(review): Angular is registered ahead of the generic analyzer; whether registration
// order affects which analyzer handles a file depends on analyzerRegistry — confirm before
// reordering.
analyzerRegistry.register(new AngularAnalyzer());
analyzerRegistry.register(new GenericAnalyzer());
33 | 38 |
|
34 | | -function isTestFile(filePath) { |
35 | | - const n = filePath.toLowerCase().replace(/\\/g, '/'); |
36 | | - return n.includes('.spec.') || n.includes('.test.') || n.includes('/e2e/') || |
37 | | - n.includes('/__tests__/'); |
38 | | -} |
39 | | - |
40 | | -function matchesPattern(filePath, patterns) { |
41 | | - const n = filePath.toLowerCase().replace(/\\/g, '/'); |
42 | | - return patterns.some(p => n.includes(p.toLowerCase())); |
/**
 * Read an eval fixture from disk and parse it as JSON.
 *
 * Failures are rethrown with the fixture path in the message, because a bare
 * `JSON.parse` SyntaxError does not say which file was malformed. The original
 * error stays available on `cause`.
 *
 * @param {string} fixturePath - Path to the fixture JSON file.
 * @returns {any} The parsed fixture contents.
 * @throws {Error} If the file cannot be read or does not contain valid JSON.
 */
function loadFixture(fixturePath) {
  let raw;
  try {
    raw = readFileSync(fixturePath, 'utf-8');
  } catch (error) {
    throw new Error(`Unable to read eval fixture "${fixturePath}": ${error.message}`, {
      cause: error
    });
  }

  try {
    return JSON.parse(raw);
  } catch (error) {
    throw new Error(`Eval fixture "${fixturePath}" is not valid JSON: ${error.message}`, {
      cause: error
    });
  }
}
44 | 43 |
|
45 | | -function hashPath(filePath) { |
46 | | - return crypto.createHash('sha1').update(filePath.toLowerCase()).digest('hex').slice(0, 8); |
/**
 * Print the run banner: the tool version followed by the embedding model in use.
 *
 * The model comes from the EMBEDDING_MODEL environment variable; when it is
 * unset or empty the built-in default label is shown instead.
 *
 * @param {string} version - Version string interpolated into the banner.
 * @returns {void}
 */
function printHeader(version) {
  const modelLabel = process.env.EMBEDDING_MODEL || 'Xenova/bge-small-en-v1.5 (default)';
  const bannerLines = [`\n=== codebase-context v${version} eval ===`, `Model: ${modelLabel}`];

  for (const line of bannerLines) {
    console.log(line);
  }
}
48 | 48 |
|
49 | | -function formatPath(filePath, redactPaths) { |
50 | | - if (!filePath) return 'none'; |
51 | | - const normalized = filePath.replace(/\\/g, '/'); |
52 | | - if (!redactPaths) return normalized; |
53 | | - const base = normalized.split('/').pop() || normalized; |
54 | | - return `path#${hashPath(normalized)}/${base}`; |
/**
 * Report whether a codebase already has index output on disk.
 *
 * Both `.codebase-context/index.json` and `.codebase-context/index` must exist
 * under `rootPath`. Only existence is checked, not the contents or entry type.
 *
 * @param {string} rootPath - Root directory of the codebase.
 * @returns {boolean} True only when both entries exist.
 */
function hasIndexArtifacts(rootPath) {
  const contextDir = path.join(rootPath, '.codebase-context');
  // Checked in this order; the second lookup is skipped when the first is missing.
  const requiredEntries = ['index.json', 'index'];

  return requiredEntries.every((entry) => existsSync(path.join(contextDir, entry)));
}
56 | 55 |
|
57 | | -async function main() { |
58 | | - const rootPath = process.argv[2]; |
59 | | - if (!rootPath) { |
60 | | - console.error('Usage: node scripts/run-eval.mjs <path-to-codebase> [--skip-reindex] [--no-rerank] [--no-redact]'); |
61 | | - process.exit(1); |
/**
 * Phase 1 of an evaluation run: build the index for a codebase unless the
 * caller asked to skip it and index artifacts are already present.
 *
 * When `skipReindex` is set but no artifacts are found, a notice is logged and
 * the index is built anyway.
 *
 * @param {string} rootPath - Root directory of the codebase to index.
 * @param {boolean} skipReindex - True when --skip-reindex was requested.
 * @returns {Promise<void>} Resolves once indexing has finished or been skipped.
 */
async function maybeReindex(rootPath, skipReindex) {
  if (skipReindex) {
    if (hasIndexArtifacts(rootPath)) {
      console.log(`\n--- Phase 1: Skipping re-index (--skip-reindex) ---`);
      return;
    }

    console.log(
      `\n--- Phase 1: --skip-reindex requested but no index artifacts found; running index build ---`
    );
  }

  console.log(`\n--- Phase 1: Re-indexing ---`);

  // Only the embedding and completion phases are echoed. The leading \r rewrites
  // the same stderr line instead of printing a new one per update.
  const reportProgress = (progress) => {
    const isReportedPhase = progress.phase === 'embedding' || progress.phase === 'complete';
    if (!isReportedPhase) {
      return;
    }

    process.stderr.write(
      `\r[${progress.phase}] ${progress.percentage}% (${progress.filesProcessed}/${progress.totalFiles} files)`
    );
  };

  const indexer = new CodebaseIndexer({ rootPath, onProgress: reportProgress });
  const stats = await indexer.index();

  console.log(
    `\nIndexing complete: ${stats.indexedFiles} files, ${stats.totalChunks} chunks in ${stats.duration}ms`
  );
}
| 85 | + |
/**
 * Short digest of a path, used to label it in logs without printing the path.
 *
 * The path is lower-cased before hashing, so the label ignores letter case.
 *
 * @param {string} absolutePath - Path to digest.
 * @returns {string} First 8 hex characters of the SHA-1 of the lower-cased path.
 */
function pathDigest(absolutePath) {
  return crypto.createHash('sha1').update(absolutePath.toLowerCase()).digest('hex').slice(0, 8);
}

/**
 * Run the full evaluation for one codebase: print the run header, build the
 * index if needed, evaluate the fixture, and print the formatted report.
 *
 * @param {object} options
 * @param {string} options.label - Short label for this codebase in the output.
 * @param {string} options.codebasePath - Path to the codebase; resolved to absolute.
 * @param {string} options.fixturePath - Path to the fixture JSON; resolved to absolute.
 * @param {boolean} options.skipReindex - Forwarded to maybeReindex.
 * @param {boolean} options.noRerank - When true, searches run with `enableReranker: false`.
 * @param {boolean} options.redactPaths - When true, header paths are shown as digests;
 *   also forwarded to formatEvalReport.
 * @returns {Promise<object>} The summary returned by evaluateFixture.
 */
async function runSingleEvaluation({
  label,
  codebasePath,
  fixturePath,
  skipReindex,
  noRerank,
  redactPaths
}) {
  const resolvedCodebase = path.resolve(codebasePath);
  const resolvedFixture = path.resolve(fixturePath);
  const fixture = loadFixture(resolvedFixture);

  // The header must honor redaction too: printing absolute paths directly above
  // "Path output: REDACTED" would leak local directory names into logs.
  const shownTarget = redactPaths ? `<repo#${pathDigest(resolvedCodebase)}>` : resolvedCodebase;
  const shownFixture = redactPaths
    ? `path#${pathDigest(resolvedFixture)}/${path.basename(resolvedFixture)}`
    : resolvedFixture;

  console.log(`\n=== Codebase: ${label} ===`);
  console.log(`Target: ${shownTarget}`);
  console.log(`Fixture: ${shownFixture}`);
  console.log(
    `Reranker: ${noRerank ? 'DISABLED' : 'enabled (ambiguity-triggered, Xenova/ms-marco-MiniLM-L-6-v2)'}`
  );
  console.log(`Path output: ${redactPaths ? 'REDACTED' : 'FULL'}`);

  await maybeReindex(resolvedCodebase, skipReindex);

  console.log(`\n--- Phase 2: Running ${fixture.queries.length}-query eval harness ---`);
  const searcher = new CodebaseSearcher(resolvedCodebase);
  const summary = await evaluateFixture({
    fixture,
    searcher,
    limit: 5,
    searchOptions: {
      enableReranker: !noRerank
    }
  });

  // NOTE(review): formatEvalReport receives the unredacted fixture path together with
  // redactPaths; presumably it applies its own redaction — confirm in the harness.
  const report = formatEvalReport({
    codebaseLabel: label,
    fixturePath: resolvedFixture,
    summary,
    redactPaths
  });

  console.log(report);
  return summary;
}
122 | 129 |
|
123 | | - const specCount = top3Files.filter(f => isTestFile(f)).length; |
124 | | - const contaminated = specCount >= 2; |
/**
 * Print totals aggregated across several per-codebase evaluation summaries.
 *
 * Each percentage is the summed count over the summed `total`. The divisor is
 * clamped to at least 1, so an empty run prints 0% instead of dividing by zero.
 *
 * @param {Array<{total: number, top1Correct: number, top3RecallCount: number,
 *   specContaminatedCount: number}>} summaries - Per-codebase summaries.
 * @returns {void}
 */
function printCombinedSummary(summaries) {
  const combined = { total: 0, top1Correct: 0, top3RecallCount: 0, specContaminatedCount: 0 };
  for (const summary of summaries) {
    combined.total += summary.total;
    combined.top1Correct += summary.top1Correct;
    combined.top3RecallCount += summary.top3RecallCount;
    combined.specContaminatedCount += summary.specContaminatedCount;
  }

  const divisor = Math.max(combined.total, 1);
  const metricLine = (title, count) =>
    `${title}: ${count}/${combined.total} (${((count / divisor) * 100).toFixed(0)}%)`;

  console.log(`\n=== Combined Summary ===`);
  console.log(metricLine('Top-1 Accuracy', combined.top1Correct));
  console.log(metricLine('Top-3 Recall', combined.top3RecallCount));
  console.log(metricLine('Spec Contamination', combined.specContaminatedCount));
  console.log(`========================\n`);
}
125 | 151 |
|
126 | | - if (top1Ok) top1Correct++; |
127 | | - if (top3Ok) top3RecallCount++; |
128 | | - if (contaminated) specContaminatedCount++; |
/**
 * CLI entry point: parse arguments, evaluate codebase A and optionally
 * codebase B, then exit with a status reflecting the quality gates.
 *
 * Exit codes set here: 0 when --help is requested or every evaluated codebase
 * passes its gate; 1 on a usage error or when any gate fails.
 *
 * @returns {Promise<void>} In practice never resolves normally; every path ends
 *   in process.exit or a thrown error.
 */
async function main() {
  let parsed;
  try {
    parsed = parseArgs({
      options: {
        help: { type: 'boolean', default: false },
        'skip-reindex': { type: 'boolean', default: false },
        'no-rerank': { type: 'boolean', default: false },
        'no-redact': { type: 'boolean', default: false },
        'fixture-a': { type: 'string' },
        'fixture-b': { type: 'string' }
      },
      allowPositionals: true
    });
  } catch (error) {
    // parseArgs rejects unknown or malformed flags with ERR_PARSE_ARGS_* codes.
    // Those are user mistakes, so report them like the positional-count check
    // below (message + usage, exit 1) instead of as a crash with a stack trace.
    if (typeof error?.code === 'string' && error.code.startsWith('ERR_PARSE_ARGS_')) {
      console.error(`${error.message}\n`);
      console.error(usage);
      process.exit(1);
    }
    throw error;
  }

  const { values, positionals } = parsed;

  if (values.help) {
    console.log(usage);
    process.exit(0);
  }

  // Exactly one or two codebase paths are accepted.
  if (positionals.length < 1 || positionals.length > 2) {
    console.error(usage);
    process.exit(1);
  }

  printHeader(packageJson.version);

  const codebaseA = positionals[0];
  const codebaseB = positionals[1];
  const fixtureA = values['fixture-a'] ? path.resolve(values['fixture-a']) : defaultFixtureA;
  const fixtureB = values['fixture-b'] ? path.resolve(values['fixture-b']) : defaultFixtureB;

  // Flags that apply identically to both evaluations.
  const sharedOptions = {
    skipReindex: values['skip-reindex'],
    noRerank: values['no-rerank'],
    redactPaths: !values['no-redact']
  };

  const summaryA = await runSingleEvaluation({
    label: 'A',
    codebasePath: codebaseA,
    fixturePath: fixtureA,
    ...sharedOptions
  });

  const summaries = [summaryA];
  let passesAllGates = summaryA.passesGate;

  // The second codebase is optional; the combined summary is printed only when
  // both were evaluated.
  if (codebaseB) {
    const summaryB = await runSingleEvaluation({
      label: 'B',
      codebasePath: codebaseB,
      fixturePath: fixtureB,
      ...sharedOptions
    });

    summaries.push(summaryB);
    passesAllGates = passesAllGates && summaryB.passesGate;
    printCombinedSummary(summaries);
  }

  process.exit(passesAllGates ? 0 : 1);
}
164 | 213 |
|
165 | | -main().catch((err) => { |
166 | | - console.error('Fatal error:', err); |
/**
 * Last-resort handler for anything main() rejects with: log the error and
 * exit with status 2.
 *
 * @param {unknown} failure - The rejection reason from main().
 */
const exitOnFatalError = (failure) => {
  console.error('Fatal error:', failure);
  process.exit(2);
};

main().catch(exitOnFatalError);
0 commit comments