|
| 1 | +import crypto from 'crypto'; |
| 2 | +import type { SearchResult } from '../types/index.js'; |
| 3 | +import type { |
| 4 | + EvalGate, |
| 5 | + EvalQuery, |
| 6 | + EvalResult, |
| 7 | + EvalSummary, |
| 8 | + EvaluateFixtureParams, |
| 9 | + FormatEvalReportParams |
| 10 | +} from './types.js'; |
| 11 | + |
/**
 * Produces the canonical form of a path used for comparisons in this module:
 * every backslash is turned into a forward slash and the text is lower-cased.
 *
 * @param filePath - Path in any separator style / letter case.
 * @returns The lower-cased, forward-slash form of `filePath`.
 */
function normalizePath(filePath: string): string {
  const forwardSlashed = filePath.split('\\').join('/');
  return forwardSlashed.toLowerCase();
}
| 15 | + |
/**
 * Reports whether a path looks like a test file.
 *
 * A path counts as a test file when, after normalisation, either
 * - its text contains `.spec.` or `.test.`, or
 * - one of its directory segments is exactly `e2e` or `__tests__`.
 *
 * Directory names are compared per segment rather than via a `'/e2e/'`
 * substring so that root-relative paths such as `e2e/login.ts` or
 * `__tests__/util.ts` (no leading slash) are recognised as well.
 *
 * @param filePath - Path in any separator style / letter case.
 * @returns `true` when the path matches one of the test-file markers.
 */
function isTestFile(filePath: string): boolean {
  const normalized = normalizePath(filePath);
  if (normalized.includes('.spec.') || normalized.includes('.test.')) {
    return true;
  }

  // Everything except the final segment is a directory name.
  const directories = normalized.split('/').slice(0, -1);
  return directories.some((segment) => segment === 'e2e' || segment === '__tests__');
}
| 25 | + |
/**
 * Checks whether a path contains at least one of the given patterns.
 *
 * Matching is a plain substring test performed on normalised text: both the
 * path and each pattern go through `normalizePath`, so letter case and the
 * separator style (`\` vs `/`) never affect the outcome.
 *
 * Empty patterns are ignored. `String.prototype.includes('')` is always true,
 * so a blank entry would otherwise match every path.
 *
 * @param filePath - Path to test.
 * @param patterns - Substrings to look for; an empty list never matches.
 * @returns `true` when any non-empty pattern occurs in the path.
 */
function matchesPattern(filePath: string, patterns: string[]): boolean {
  const normalized = normalizePath(filePath);
  return patterns.some((pattern) => {
    const needle = normalizePath(pattern);
    return needle.length > 0 && normalized.includes(needle);
  });
}
| 30 | + |
/**
 * Keeps only the first result for each file, preserving the input order.
 *
 * Two results refer to the same file when their `filePath` values are equal
 * after `normalizePath` (case- and separator-insensitive).
 *
 * @param results - Results in ranked order.
 * @returns A new array holding the first occurrence of every distinct file.
 */
function dedupeByFile(results: SearchResult[]): SearchResult[] {
  const claimed = new Set<string>();

  return results.filter((candidate) => {
    const key = normalizePath(candidate.filePath);
    if (claimed.has(key)) {
      return false;
    }

    claimed.add(key);
    return true;
  });
}
| 47 | + |
/**
 * Resolves the patterns a query's hit is expected to match.
 *
 * `expectedPatterns` wins when it is neither `null` nor `undefined` (an empty
 * array counts as present); otherwise `expectedTopFiles` is used, and when
 * both are absent the result is an empty list.
 */
function getExpectedPatterns(query: EvalQuery): string[] {
  const primary = query.expectedPatterns;
  if (primary != null) {
    return primary;
  }

  const fallback = query.expectedTopFiles;
  if (fallback != null) {
    return fallback;
  }

  return [];
}
| 51 | + |
/**
 * Resolves the patterns a query's hit must NOT match.
 *
 * `expectedNotPatterns` wins when it is neither `null` nor `undefined` (an
 * empty array counts as present); otherwise `expectedNotTopFiles` is used, and
 * when both are absent the result is an empty list.
 */
function getExpectedNotPatterns(query: EvalQuery): string[] {
  const primary = query.expectedNotPatterns;
  if (primary != null) {
    return primary;
  }

  const fallback = query.expectedNotTopFiles;
  if (fallback != null) {
    return fallback;
  }

  return [];
}
| 55 | + |
/**
 * Scores the search results of a single fixture query.
 *
 * Results are first collapsed to one entry per file (`dedupeByFile`). A file
 * is a "hit" when it matches one of the expected patterns and none of the
 * expected-not patterns. From that the function derives:
 * - `top1Correct`: the first file is a hit,
 * - `top3Recall`: any of the first three files is a hit,
 * - `specContaminated`: at least two of the first three files are test files,
 * - `score`: the score of the first result, or `0` when there are none.
 *
 * @param query - Fixture entry describing the query and its expectations.
 * @param results - Search results in ranked order.
 * @returns The per-query evaluation record.
 */
function evaluateQuery(query: EvalQuery, results: SearchResult[]): EvalResult {
  const expectedPatterns = getExpectedPatterns(query);
  const expectedNotPatterns = getExpectedNotPatterns(query);
  const isHit = (filePath: string): boolean =>
    matchesPattern(filePath, expectedPatterns) && !matchesPattern(filePath, expectedNotPatterns);

  const ranked = dedupeByFile(results);
  const best = ranked[0];
  const topFile = best ? best.filePath : null;
  const top3Files = ranked.slice(0, 3).map((entry) => entry.filePath);

  let testFilesInTop3 = 0;
  for (const filePath of top3Files) {
    if (isTestFile(filePath)) {
      testFilesInTop3 += 1;
    }
  }

  return {
    queryId: query.id,
    query: query.query,
    category: query.category,
    expectedPatterns,
    expectedNotPatterns,
    topFile,
    top3Files,
    top1Correct: topFile !== null && isHit(topFile),
    top3Recall: top3Files.some((filePath) => isHit(filePath)),
    specContaminated: testFilesInTop3 >= 2,
    score: best ? best.score : 0
  };
}
| 91 | + |
/**
 * Converts a gate setting into the minimum number of correct queries required.
 *
 * - `gate <= 1` is read as a fraction of `total` (so `1` means "all of them")
 *   and is rounded up to the next whole query.
 * - `gate > 1` is read as an absolute count and is rounded up.
 *
 * The fractional product is rounded to 9 decimal places before the ceiling is
 * taken. Without that, binary floating-point noise can push an exact product
 * just above an integer (`100 * 0.55 === 55.00000000000001`), which would
 * silently raise the requirement by one query.
 *
 * @param total - Number of evaluated queries.
 * @param gate - Fraction (`<= 1`) or absolute count (`> 1`).
 * @returns The number of top-1-correct queries needed to pass the gate.
 */
function resolveGateThreshold(total: number, gate: EvalGate): number {
  if (gate <= 1) {
    const required = Number((total * gate).toFixed(9));
    return Math.ceil(required);
  }

  return Math.ceil(gate);
}
| 99 | + |
/**
 * Derives a short fingerprint for a path: the first 8 hex characters of the
 * SHA-1 digest of the normalised path. Because the input goes through
 * `normalizePath`, paths differing only in case or separator style share a
 * fingerprint.
 *
 * @param filePath - Path to fingerprint.
 * @returns An 8-character lowercase hex string.
 */
function hashPath(filePath: string): string {
  const hasher = crypto.createHash('sha1');
  hasher.update(normalizePath(filePath));
  const digest = hasher.digest('hex');
  return digest.slice(0, 8);
}
| 103 | + |
/**
 * Renders a path for the report.
 *
 * - A missing or empty path is shown as `none`.
 * - Without redaction the path is shown with forward slashes.
 * - With redaction only the final path segment is shown, prefixed by a short
 *   hash of the whole path: `path#<hash>/<last segment>`.
 *
 * The last segment is taken from the non-empty segments only. A path ending
 * in a separator would otherwise yield an empty base name, and falling back
 * to the whole path in that case would print exactly what redaction is meant
 * to hide.
 *
 * @param filePath - Path to display, or `null` when there is none.
 * @param redactPaths - Whether directory components must be hidden.
 * @returns The display string.
 */
function formatPath(filePath: string | null, redactPaths: boolean): string {
  if (!filePath) {
    return 'none';
  }

  const normalized = filePath.replace(/\\/g, '/');
  if (!redactPaths) {
    return normalized;
  }

  const segments = normalized.split('/').filter((segment) => segment.length > 0);
  const base = segments.pop() ?? '';
  return `path#${hashPath(normalized)}/${base}`;
}
| 117 | + |
/**
 * Runs every query of a fixture through the searcher and summarises the
 * outcome.
 *
 * Queries are issued one after another (each search is awaited before the
 * next starts), scored with `evaluateQuery`, and the collected records are
 * passed to `summarizeEvaluation` with its default gate.
 *
 * @param fixture - Fixture whose `queries` are evaluated in order.
 * @param searcher - Object whose `search` method produces the ranked results.
 * @param limit - Result limit forwarded to `searcher.search` (default `5`).
 * @param searchOptions - Forwarded unchanged as the fourth `search` argument.
 * @returns The aggregated evaluation summary.
 */
export async function evaluateFixture({
  fixture,
  searcher,
  limit = 5,
  searchOptions
}: EvaluateFixtureParams): Promise<EvalSummary> {
  const evaluated: EvalResult[] = [];

  for (const entry of fixture.queries) {
    // Deliberately sequential: one search at a time, in fixture order.
    const hits = await searcher.search(entry.query, limit, undefined, searchOptions);
    const outcome = evaluateQuery(entry, hits);
    evaluated.push(outcome);
  }

  return summarizeEvaluation(evaluated);
}
| 133 | + |
/**
 * Aggregates per-query records into overall metrics.
 *
 * Counts how many records are top-1 correct, have a top-3 hit, and are
 * spec-contaminated; averages the top scores; and resolves the gate into a
 * required number of top-1-correct queries. Every rate (and the average) is
 * `0` for an empty input, and an empty input never passes the gate.
 *
 * @param results - Per-query evaluation records.
 * @param gate - Gate setting handed to `resolveGateThreshold` (default `0.7`).
 * @returns The summary, including the untouched `results` array.
 */
export function summarizeEvaluation(results: EvalResult[], gate: EvalGate = 0.7): EvalSummary {
  const total = results.length;
  let top1Correct = 0;
  let top3RecallCount = 0;
  let specContaminatedCount = 0;
  let scoreSum = 0;

  for (const entry of results) {
    if (entry.top1Correct) {
      top1Correct += 1;
    }
    if (entry.top3Recall) {
      top3RecallCount += 1;
    }
    if (entry.specContaminated) {
      specContaminatedCount += 1;
    }
    scoreSum += entry.score;
  }

  // Guards every division against an empty result set.
  const perQuery = (amount: number): number => (total > 0 ? amount / total : 0);
  const gateThreshold = resolveGateThreshold(total, gate);

  return {
    total,
    top1Correct,
    top1Accuracy: perQuery(top1Correct),
    top3RecallCount,
    top3Recall: perQuery(top3RecallCount),
    specContaminatedCount,
    specContaminationRate: perQuery(specContaminatedCount),
    avgTopScore: perQuery(scoreSum),
    gateThreshold,
    passesGate: total > 0 && top1Correct >= gateThreshold,
    results
  };
}
| 157 | + |
/**
 * Renders an evaluation summary as a plain-text report.
 *
 * Layout: a header with the codebase label and fixture path, the aggregate
 * metrics (top-1 accuracy, top-3 recall, spec contamination, gate verdict),
 * then a "Wins" section (queries whose top result was correct) and a
 * "Failures" section. Each failure additionally lists its expected and
 * expected-not patterns and the top-3 files actually returned. All file paths
 * go through `formatPath`, which honours `redactPaths`.
 *
 * @param codebaseLabel - Name shown in the report header.
 * @param fixturePath - Fixture location shown in the report header.
 * @param summary - Summary to render.
 * @param redactPaths - Whether paths are redacted (default `true`).
 * @returns The report as a single newline-joined string.
 */
export function formatEvalReport({
  codebaseLabel,
  fixturePath,
  summary,
  redactPaths = true
}: FormatEvalReportParams): string {
  type Entry = (typeof summary.results)[number];

  const asPercent = (ratio: number): string => (ratio * 100).toFixed(0);
  const shown = (filePath: string | null): string => formatPath(filePath, redactPaths);
  const headline = (verdict: 'PASS' | 'FAIL', entry: Entry): string =>
    ` ${verdict} #${entry.queryId} [${entry.category}] "${entry.query}" -> ${shown(entry.topFile)} (${entry.score.toFixed(3)})`;

  const wins = summary.results.filter((entry) => entry.top1Correct);
  const failures = summary.results.filter((entry) => !entry.top1Correct);

  // Header and aggregate metrics.
  const report: string[] = [
    `\n=== Eval Report: ${codebaseLabel} ===`,
    `Fixture: ${fixturePath}`,
    `Top-1 Accuracy: ${summary.top1Correct}/${summary.total} (${asPercent(summary.top1Accuracy)}%)`,
    `Top-3 Recall: ${summary.top3RecallCount}/${summary.total} (${asPercent(summary.top3Recall)}%)`,
    `Spec Contamination: ${summary.specContaminatedCount}/${summary.total} (${asPercent(summary.specContaminationRate)}%)`,
    `Gate (${summary.gateThreshold}/${summary.total}): ${summary.passesGate ? 'PASS' : 'FAIL'}`,
    `Wins: ${wins.length} | Failures: ${failures.length}`
  ];

  report.push('\nWins:');
  if (wins.length > 0) {
    for (const entry of wins) {
      report.push(headline('PASS', entry));
    }
  } else {
    report.push(' (none)');
  }

  report.push('\nFailures:');
  if (failures.length > 0) {
    for (const entry of failures) {
      report.push(headline('FAIL', entry));
      report.push(` expected: ${entry.expectedPatterns.join(' | ') || '(none)'}`);
      report.push(` expected-not: ${entry.expectedNotPatterns.join(' | ') || '(none)'}`);
      report.push(' top-3 actual:');

      if (entry.top3Files.length > 0) {
        for (const [position, filePath] of entry.top3Files.entries()) {
          report.push(` ${position + 1}. ${shown(filePath)}`);
        }
      } else {
        report.push(' 1. none');
      }
    }
  } else {
    report.push(' (none)');
  }

  report.push('\n================================');
  return report.join('\n');
}
0 commit comments