|
| 1 | +#!/usr/bin/env node |
| 2 | +/** |
| 3 | + * Search quality evaluation runner (single canonical script). |
| 4 | + * |
| 5 | + * Re-indexes a target codebase with the current model+chunking settings |
| 6 | + * and runs the eval harness from tests/fixtures/eval-angular-spotify.json (or from the fixture given via --fixture=<path>). |
| 7 | + * Paths in output are redacted by default for publishable logs; use |
| 8 | + * --no-redact for full paths (e.g. internal runs). |
| 9 | + * |
| 10 | + * Usage: node scripts/run-eval.mjs <path-to-codebase> [--skip-reindex] [--no-rerank] [--no-redact] [--fixture=<path>] |
| 11 | + */ |
| 12 | + |
| 13 | +import path from 'path'; |
| 14 | +import crypto from 'crypto'; |
| 15 | +import { readFileSync } from 'fs'; |
| 16 | +import { fileURLToPath } from 'url'; |
| 17 | +import { CodebaseIndexer } from '../dist/core/indexer.js'; |
| 18 | +import { CodebaseSearcher } from '../dist/core/search.js'; |
| 19 | +import { analyzerRegistry } from '../dist/core/analyzer-registry.js'; |
| 20 | +import { AngularAnalyzer } from '../dist/analyzers/angular/index.js'; |
| 21 | +import { GenericAnalyzer } from '../dist/analyzers/generic/index.js'; |
| 22 | + |
// Directory containing this script; the default fixture is resolved relative to it.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Optional `--fixture=<path>` override. Everything after the flag prefix is taken as the
// path, so a path that itself contains '=' is kept intact (splitting on '=' would cut it).
const FIXTURE_FLAG = '--fixture=';
const fixtureArg = process.argv.find((arg) => arg.startsWith(FIXTURE_FLAG));
const fixturePath = fixtureArg
  ? path.resolve(fixtureArg.slice(FIXTURE_FLAG.length))
  : path.join(__dirname, '..', 'tests', 'fixtures', 'eval-angular-spotify.json');

// Parsed fixture JSON; main() reads its `queries` array.
const evalFixture = JSON.parse(readFileSync(fixturePath, 'utf-8'));
| 29 | + |
// Register analyzers: Angular first, then the generic one. Each analyzer is constructed
// immediately before it is registered.
for (const Analyzer of [AngularAnalyzer, GenericAnalyzer]) {
  analyzerRegistry.register(new Analyzer());
}
| 33 | + |
/**
 * Heuristically decides whether a path points at a test file.
 *
 * The path is lower-cased and backslashes are converted to forward slashes before
 * matching. A path counts as a test file when it contains `.spec.` or `.test.`, or when
 * one of its directory segments is exactly `e2e` or `__tests__` — including the leading
 * segment of a relative path such as `e2e/app.po.ts`.
 *
 * @param {string} filePath - File path using either `/` or `\` separators.
 * @returns {boolean} True when the path looks like a test file.
 */
function isTestFile(filePath) {
  const normalized = filePath.toLowerCase().replace(/\\/g, '/');
  if (normalized.includes('.spec.') || normalized.includes('.test.')) {
    return true;
  }

  // Compare whole directory segments (every segment except the last) so that a relative
  // path beginning with the directory matches just like a nested or absolute one.
  const directories = normalized.split('/').slice(0, -1);
  return directories.includes('e2e') || directories.includes('__tests__');
}
| 39 | + |
/**
 * Checks whether a file path contains any of the given substring patterns.
 *
 * Both the path and every pattern are lower-cased and have backslashes converted to
 * forward slashes, so a pattern written with `\` separators still matches a path written
 * with `/` separators (and vice versa).
 *
 * @param {string} filePath - Path to test.
 * @param {string[]} patterns - Plain substrings (not globs or regular expressions).
 * @returns {boolean} True when at least one pattern occurs in the path; false for an
 *   empty pattern list.
 */
function matchesPattern(filePath, patterns) {
  const normalize = (value) => value.toLowerCase().replace(/\\/g, '/');
  const haystack = normalize(filePath);
  return patterns.some((pattern) => haystack.includes(normalize(pattern)));
}
| 44 | + |
/**
 * Produces a short, case-insensitive fingerprint of a path.
 *
 * @param {string} filePath - Path to fingerprint; it is lower-cased before hashing.
 * @returns {string} The first 8 hexadecimal characters of the SHA-1 digest.
 */
function hashPath(filePath) {
  const lowered = filePath.toLowerCase();
  const digestHex = crypto.createHash('sha1').update(lowered).digest('hex');
  return digestHex.substring(0, 8);
}
| 48 | + |
/**
 * Formats a file path for log output, optionally redacting its directories.
 *
 * @param {string | null | undefined} filePath - Path to print; any falsy value yields
 *   the literal `'none'`.
 * @param {boolean} redactPaths - When falsy, the path is returned with backslashes
 *   converted to forward slashes. When truthy, the result is
 *   `path#<hashPath(normalized path)>/<last non-empty segment>`.
 * @returns {string} The printable path.
 */
function formatPath(filePath, redactPaths) {
  if (!filePath) return 'none';

  const normalized = filePath.replace(/\\/g, '/');
  if (!redactPaths) return normalized;

  // Use the last NON-EMPTY segment. With a trailing slash the last raw segment is empty,
  // and falling back to the whole path would print exactly what redaction should hide.
  const base = normalized.split('/').filter(Boolean).pop() ?? '';
  return `path#${hashPath(normalized)}/${base}`;
}
| 56 | + |
/**
 * Script entry point: optionally re-indexes the target codebase, runs every fixture
 * query through the searcher, prints per-query results and a summary, then exits with
 * the gate result.
 *
 * Command line (read from process.argv):
 *   <path-to-codebase>  first argument that does not start with `--` (required)
 *   --skip-reindex      skip Phase 1 and search the existing index
 *   --no-rerank         pass `enableReranker: false` to the searcher
 *   --no-redact         print full paths instead of hashed ones
 *   --fixture=<path>    alternate fixture file (read when the module loads)
 *
 * Exit codes: 0 when top-1 accuracy reaches the 70% gate, 1 on a usage error or a failed
 * gate. The returned promise rejects when the fixture has no queries, because a gate
 * computed over zero queries would pass trivially.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const cliArgs = process.argv.slice(2);
  const hasFlag = (flag) => cliArgs.includes(flag);

  // The codebase path is the first positional argument, so flags may come before it.
  const rootPath = cliArgs.find((arg) => !arg.startsWith('--'));
  if (!rootPath) {
    console.error(
      'Usage: node scripts/run-eval.mjs <path-to-codebase> [--skip-reindex] [--no-rerank] [--no-redact] [--fixture=<path>]'
    );
    process.exit(1);
  }

  // Validate before the (potentially slow) re-index: with zero queries the percentages
  // are NaN and the gate threshold is 0, which would report PASS without testing anything.
  const queries = Array.isArray(evalFixture?.queries) ? evalFixture.queries : [];
  if (queries.length === 0) {
    throw new Error('Eval fixture has no queries; cannot compute a meaningful gate result.');
  }

  const resolvedPath = path.resolve(rootPath);
  const redactPaths = !hasFlag('--no-redact');
  console.log(`\n=== v1.6.0 Search Quality Evaluation ===`);
  console.log(`Target: ${redactPaths ? `<repo#${hashPath(resolvedPath)}>` : resolvedPath}`);
  console.log(`Model: ${process.env.EMBEDDING_MODEL || 'Xenova/bge-small-en-v1.5 (default)'}`);

  // Phase 1: Re-index
  const skipReindex = hasFlag('--skip-reindex');
  if (!skipReindex) {
    console.log(`\n--- Phase 1: Re-indexing ---`);
    const indexer = new CodebaseIndexer({
      rootPath: resolvedPath,
      onProgress: (p) => {
        // Only the embedding/complete phases are reported; '\r' rewrites the same line.
        if (p.phase === 'embedding' || p.phase === 'complete') {
          process.stderr.write(`\r[${p.phase}] ${p.percentage}% (${p.filesProcessed}/${p.totalFiles} files)`);
        }
      },
    });
    const stats = await indexer.index();
    console.log(`\nIndexing complete: ${stats.indexedFiles} files, ${stats.totalChunks} chunks in ${stats.duration}ms`);
  } else {
    console.log(`\n--- Phase 1: Skipping re-index (--skip-reindex) ---`);
  }

  // Phase 2: Run eval harness
  const noRerank = hasFlag('--no-rerank');
  console.log(`\n--- Phase 2: Running ${queries.length}-query eval harness ---`);
  console.log(`Reranker: ${noRerank ? 'DISABLED' : 'enabled (ambiguity-triggered, Xenova/ms-marco-MiniLM-L-6-v2)'}`);
  console.log(`File-level dedupe: enabled`);
  console.log(`Path output: ${redactPaths ? 'REDACTED' : 'FULL'}`);
  const searcher = new CodebaseSearcher(resolvedPath);

  // One formatter for every printed score: three decimals for numbers, '?' otherwise, so
  // a result without a numeric score cannot abort the whole run.
  const formatScore = (score) => (typeof score === 'number' ? score.toFixed(3) : '?');

  let top1Correct = 0;
  let top3RecallCount = 0;
  let specContaminatedCount = 0;

  for (const q of queries) {
    // Search results are already file-level deduped by the engine
    const results = await searcher.search(q.query, 5, undefined, {
      enableReranker: !noRerank,
    });

    const topFile = results.length > 0 ? results[0].filePath : null;
    const top3Files = results.slice(0, 3).map((r) => r.filePath);
    const topScore = results.length > 0 ? results[0].score : 0;

    // Evaluate (support both old and new fixture formats)
    const expectedPatterns = q.expectedPatterns || q.expectedTopFiles || [];
    const expectedNotPatterns = q.expectedNotPatterns || q.expectedNotTopFiles || [];
    const isExpectedHit = (file) =>
      matchesPattern(file, expectedPatterns) && !matchesPattern(file, expectedNotPatterns);

    const top1Ok = topFile !== null && isExpectedHit(topFile);
    const top3Ok = top3Files.some((file) => isExpectedHit(file));

    // A query is "contaminated" when at least two of its top three hits are test files.
    const specCount = top3Files.filter((file) => isTestFile(file)).length;
    const contaminated = specCount >= 2;

    if (top1Ok) top1Correct++;
    if (top3Ok) top3RecallCount++;
    if (contaminated) specContaminatedCount++;

    const statusIcon = top1Ok ? 'PASS' : 'FAIL';
    const topFileShort = formatPath(topFile, redactPaths);
    const contNote = contaminated ? ' [SPEC CONTAMINATED]' : '';

    console.log(` ${statusIcon} [${q.category}] #${q.id} "${q.query}"`);
    console.log(` -> ${topFileShort} (score: ${formatScore(topScore)})${contNote}`);

    if (!top1Ok) {
      if (topFile) {
        console.log(` expected pattern: ${expectedPatterns.join(' | ')}`);
      }

      // Show top 3 for failures
      console.log(` top 3:`);
      top3Files.forEach((file, i) => {
        const short = formatPath(file, redactPaths);
        console.log(` ${i + 1}. ${short} (${formatScore(results[i]?.score)})`);
      });
    }
  }

  // Summary
  const total = queries.length;
  const asPercent = (count) => ((count / total) * 100).toFixed(0);
  console.log(`\n=== RESULTS ===`);
  console.log(`Top-1 Accuracy: ${top1Correct}/${total} (${asPercent(top1Correct)}%)`);
  console.log(`Top-3 Recall: ${top3RecallCount}/${total} (${asPercent(top3RecallCount)}%)`);
  console.log(`Spec Contamination: ${specContaminatedCount}/${total} (${asPercent(specContaminatedCount)}%)`);

  // Gate: top-1 hits must reach 70% of the queries, rounded up.
  const gateThreshold = Math.ceil(total * 0.7);
  const passesGate = top1Correct >= gateThreshold;
  console.log(`Gate (${gateThreshold}/${total}):${' '.repeat(Math.max(1, 8 - String(gateThreshold).length - String(total).length))}${passesGate ? 'PASS' : 'FAIL'}`);
  console.log(`\n================================\n`);

  process.exit(passesGate ? 0 : 1);
}
| 164 | + |
// Last-resort handler: log whatever main() rejected with, then exit with status 2.
function exitOnFatalError(error) {
  console.error('Fatal error:', error);
  process.exit(2);
}

main().catch(exitOnFatalError);
0 commit comments