Skip to content

Commit 5c5319b

Browse files
committed
feat(03-03): centralize eval harness scoring logic
- add reusable src/eval harness and shared types for fixture scoring - migrate eval harness tests to import shared module and validate both fixtures
1 parent 4c68d21 commit 5c5319b

File tree

3 files changed

+455
-275
lines changed

3 files changed

+455
-275
lines changed

src/eval/harness.ts

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
import crypto from 'crypto';
2+
import type { SearchResult } from '../types/index.js';
3+
import type {
4+
EvalGate,
5+
EvalQuery,
6+
EvalResult,
7+
EvalSummary,
8+
EvaluateFixtureParams,
9+
FormatEvalReportParams
10+
} from './types.js';
11+
12+
/**
 * Canonicalizes a file path for comparisons by lower-casing it and turning
 * every backslash into a forward slash.
 *
 * @param filePath - Path in any casing, using either separator style.
 * @returns The lower-cased path with forward slashes only.
 */
function normalizePath(filePath: string): string {
  const lowered = filePath.toLowerCase();
  return lowered.split('\\').join('/');
}
15+
16+
/**
 * Reports whether a path looks like a test artifact.
 *
 * The path is normalized first, then checked for any of the substrings
 * `.spec.`, `.test.`, `/e2e/` or `/__tests__/`.
 *
 * @param filePath - Path to classify.
 * @returns `true` when at least one test marker occurs in the normalized path.
 */
function isTestFile(filePath: string): boolean {
  const testMarkers = ['.spec.', '.test.', '/e2e/', '/__tests__/'];
  const candidate = normalizePath(filePath);

  for (const marker of testMarkers) {
    if (candidate.includes(marker)) {
      return true;
    }
  }

  return false;
}
25+
26+
/**
 * Checks whether a file path contains at least one of the given patterns.
 *
 * Both the path and each pattern go through `normalizePath`, so matching is
 * case-insensitive and separator-agnostic (a pattern written with backslashes
 * matches a path written with forward slashes and vice versa).
 *
 * Empty patterns are ignored: an empty string is a substring of every path,
 * so a stray `""` in a fixture would otherwise match everything.
 *
 * @param filePath - Path to test.
 * @param patterns - Substring patterns; an empty list never matches.
 * @returns `true` when any non-empty pattern occurs in the normalized path.
 */
function matchesPattern(filePath: string, patterns: string[]): boolean {
  const normalized = normalizePath(filePath);

  return patterns.some((pattern) => {
    const needle = normalizePath(pattern);
    return needle.length > 0 && normalized.includes(needle);
  });
}
30+
31+
/**
 * Keeps only the first result for each file, preserving the input order.
 *
 * Two results refer to the same file when their normalized paths are equal.
 *
 * @param results - Search results in ranked order.
 * @returns A new array holding the first occurrence of every distinct file.
 */
function dedupeByFile(results: SearchResult[]): SearchResult[] {
  const visited = new Set<string>();

  return results.filter((entry) => {
    const canonical = normalizePath(entry.filePath);
    if (visited.has(canonical)) {
      return false;
    }

    visited.add(canonical);
    return true;
  });
}
47+
48+
/**
 * Resolves the patterns a query expects to match.
 *
 * `expectedPatterns` wins when present; otherwise `expectedTopFiles` is used;
 * when neither is set the result is an empty list.
 *
 * @param query - Fixture query to read from.
 * @returns The expected patterns, possibly empty.
 */
function getExpectedPatterns(query: EvalQuery): string[] {
  if (query.expectedPatterns != null) {
    return query.expectedPatterns;
  }

  if (query.expectedTopFiles != null) {
    return query.expectedTopFiles;
  }

  return [];
}
51+
52+
/**
 * Resolves the patterns a query's results must NOT match.
 *
 * `expectedNotPatterns` wins when present; otherwise `expectedNotTopFiles` is
 * used; when neither is set the result is an empty list.
 *
 * @param query - Fixture query to read from.
 * @returns The disallowed patterns, possibly empty.
 */
function getExpectedNotPatterns(query: EvalQuery): string[] {
  if (query.expectedNotPatterns != null) {
    return query.expectedNotPatterns;
  }

  if (query.expectedNotTopFiles != null) {
    return query.expectedNotTopFiles;
  }

  return [];
}
55+
56+
/**
 * Scores the search results of a single fixture query.
 *
 * Results are de-duplicated by file first. A file counts as a hit when it
 * matches an expected pattern and none of the expected-not patterns.
 *
 * @param query - Fixture query with its expectations.
 * @param results - Ranked search results returned for the query.
 * @returns The per-query evaluation: the top file, the top three files,
 *   top-1 and top-3 hit flags, the test-file contamination flag and the
 *   score of the top result (0 when there are no results).
 */
function evaluateQuery(query: EvalQuery, results: SearchResult[]): EvalResult {
  const ranked = dedupeByFile(results);
  const best = ranked.length > 0 ? ranked[0] : undefined;
  const topFile = best !== undefined ? best.filePath : null;
  const top3Files = ranked.slice(0, 3).map((entry) => entry.filePath);

  const expectedPatterns = getExpectedPatterns(query);
  const expectedNotPatterns = getExpectedNotPatterns(query);

  const isHit = (filePath: string): boolean =>
    matchesPattern(filePath, expectedPatterns) && !matchesPattern(filePath, expectedNotPatterns);

  const top1Correct = topFile !== null && isHit(topFile);
  const top3Recall = top3Files.some((filePath) => isHit(filePath));

  // Two or more test files among the top three marks the query as contaminated.
  let testFilesInTop3 = 0;
  for (const filePath of top3Files) {
    if (isTestFile(filePath)) {
      testFilesInTop3 += 1;
    }
  }
  const specContaminated = testFilesInTop3 >= 2;

  return {
    queryId: query.id,
    query: query.query,
    category: query.category,
    expectedPatterns,
    expectedNotPatterns,
    topFile,
    top3Files,
    top1Correct,
    top3Recall,
    specContaminated,
    score: best !== undefined ? best.score : 0
  };
}
91+
92+
/**
 * Converts a gate setting into the number of correct results required.
 *
 * A gate of 1 or less is read as a fraction of `total`; a larger gate is read
 * as an absolute count. The value is rounded up to a whole number.
 *
 * A tiny tolerance is subtracted before rounding so that floating-point noise
 * in the product does not demand one extra result: `100 * 0.07` evaluates to
 * `7.000000000000001`, which a bare `Math.ceil` would turn into 8 instead of 7.
 * Negative gates are clamped to 0 so the threshold is never negative.
 *
 * @param total - Number of evaluated queries.
 * @param gate - Fraction (`<= 1`) or absolute count (`> 1`).
 * @returns The required number of correct results (`NaN` if `gate` is `NaN`).
 */
function resolveGateThreshold(total: number, gate: EvalGate): number {
  const roundingTolerance = 1e-9;
  const required = gate <= 1 ? total * gate : gate;

  return Math.max(0, Math.ceil(required - roundingTolerance));
}
99+
100+
/**
 * Produces a short, stable identifier for a path.
 *
 * The path is normalized, hashed with SHA-1, and the first eight hexadecimal
 * characters of the digest are returned.
 *
 * @param filePath - Path to fingerprint.
 * @returns An eight-character lowercase hex string.
 */
function hashPath(filePath: string): string {
  const hasher = crypto.createHash('sha1');
  hasher.update(normalizePath(filePath));

  const hexDigest = hasher.digest('hex');
  return hexDigest.substring(0, 8);
}
103+
104+
/**
 * Renders a file path for the report, optionally hiding its directories.
 *
 * - A `null` or empty path is rendered as `none`.
 * - Without redaction the path is returned with backslashes converted to
 *   forward slashes.
 * - With redaction the output is `path#<hash>/<base>`, where `<hash>` comes
 *   from `hashPath` and `<base>` is the last non-empty path segment.
 *
 * The last NON-EMPTY segment is used on purpose: for a path ending in `/` the
 * final segment is empty, and falling back to the whole path at that point
 * would print the directories that redaction is supposed to hide.
 *
 * @param filePath - Path to render, or `null` when there is none.
 * @param redactPaths - Whether to replace directories with a short hash.
 * @returns The printable path.
 */
function formatPath(filePath: string | null, redactPaths: boolean): string {
  if (!filePath) {
    return 'none';
  }

  const normalized = filePath.replace(/\\/g, '/');
  if (!redactPaths) {
    return normalized;
  }

  const segments = normalized.split('/').filter((segment) => segment.length > 0);
  // Only a path made solely of separators has no segments; it carries nothing to redact.
  const base = segments.pop() ?? normalized;
  return `path#${hashPath(normalized)}/${base}`;
}
117+
118+
/**
 * Runs every query of a fixture through the searcher and summarizes the scores.
 *
 * Queries are searched one after another, in fixture order. The summary is
 * produced by `summarizeEvaluation` with its default gate.
 *
 * @param params - The fixture, the searcher, an optional result `limit`
 *   (default 5) and optional `searchOptions` forwarded to each search.
 * @returns The aggregated evaluation summary.
 */
export async function evaluateFixture(params: EvaluateFixtureParams): Promise<EvalSummary> {
  const { fixture, searcher, limit = 5, searchOptions } = params;
  const perQuery: EvalResult[] = [];

  for (const entry of fixture.queries) {
    const hits = await searcher.search(entry.query, limit, undefined, searchOptions);
    const scored = evaluateQuery(entry, hits);
    perQuery.push(scored);
  }

  return summarizeEvaluation(perQuery);
}
133+
134+
/**
 * Aggregates per-query results into counts, rates and a gate verdict.
 *
 * All rates and the average top score are 0 when `results` is empty. The gate
 * passes only when there is at least one result and the number of top-1
 * correct results reaches the threshold from `resolveGateThreshold`.
 *
 * @param results - Per-query evaluation results.
 * @param gate - Gate setting passed to `resolveGateThreshold` (default 0.7).
 * @returns The summary, including the original `results` array.
 */
export function summarizeEvaluation(results: EvalResult[], gate: EvalGate = 0.7): EvalSummary {
  const total = results.length;

  let top1Correct = 0;
  let top3RecallCount = 0;
  let specContaminatedCount = 0;
  let scoreSum = 0;

  for (const entry of results) {
    if (entry.top1Correct) {
      top1Correct += 1;
    }
    if (entry.top3Recall) {
      top3RecallCount += 1;
    }
    if (entry.specContaminated) {
      specContaminatedCount += 1;
    }
    scoreSum += entry.score;
  }

  // Guards every division against an empty result set.
  const perQuery = (amount: number): number => (total > 0 ? amount / total : 0);
  const gateThreshold = resolveGateThreshold(total, gate);

  return {
    total,
    top1Correct,
    top1Accuracy: perQuery(top1Correct),
    top3RecallCount,
    top3Recall: perQuery(top3RecallCount),
    specContaminatedCount,
    specContaminationRate: perQuery(specContaminatedCount),
    avgTopScore: perQuery(scoreSum),
    gateThreshold,
    passesGate: total > 0 && top1Correct >= gateThreshold,
    results
  };
}
157+
158+
/**
 * Renders an evaluation summary as a multi-line text report.
 *
 * The report contains a header with the headline metrics and the gate
 * verdict, then one line per win, then each failure with its expected and
 * expected-not patterns and its top three files.
 *
 * @param params - Label and fixture path for the header, the summary to
 *   render, and `redactPaths` (default `true`), which controls how file
 *   paths are passed through `formatPath`.
 * @returns The report as a single newline-joined string.
 */
export function formatEvalReport(params: FormatEvalReportParams): string {
  const { codebaseLabel, fixturePath, summary, redactPaths = true } = params;

  const asPercent = (ratio: number): string => `${(ratio * 100).toFixed(0)}%`;
  const shown = (filePath: string | null): string => formatPath(filePath, redactPaths);
  const headline = (verdict: 'PASS' | 'FAIL', entry: EvalResult): string =>
    ` ${verdict} #${entry.queryId} [${entry.category}] "${entry.query}" -> ${shown(entry.topFile)} (${entry.score.toFixed(3)})`;

  // Split results into wins and failures, keeping their original order.
  const wins: EvalResult[] = [];
  const failures: EvalResult[] = [];
  for (const entry of summary.results) {
    (entry.top1Correct ? wins : failures).push(entry);
  }

  const report: string[] = [
    `\n=== Eval Report: ${codebaseLabel} ===`,
    `Fixture: ${fixturePath}`,
    `Top-1 Accuracy: ${summary.top1Correct}/${summary.total} (${asPercent(summary.top1Accuracy)})`,
    `Top-3 Recall: ${summary.top3RecallCount}/${summary.total} (${asPercent(summary.top3Recall)})`,
    `Spec Contamination: ${summary.specContaminatedCount}/${summary.total} (${asPercent(summary.specContaminationRate)})`,
    `Gate (${summary.gateThreshold}/${summary.total}): ${summary.passesGate ? 'PASS' : 'FAIL'}`,
    `Wins: ${wins.length} | Failures: ${failures.length}`
  ];

  report.push('\nWins:');
  if (wins.length === 0) {
    report.push(' (none)');
  }
  wins.forEach((entry) => {
    report.push(headline('PASS', entry));
  });

  report.push('\nFailures:');
  if (failures.length === 0) {
    report.push(' (none)');
  }
  failures.forEach((entry) => {
    report.push(headline('FAIL', entry));
    report.push(` expected: ${entry.expectedPatterns.join(' | ') || '(none)'}`);
    report.push(` expected-not: ${entry.expectedNotPatterns.join(' | ') || '(none)'}`);
    report.push(' top-3 actual:');

    if (entry.top3Files.length === 0) {
      report.push(' 1. none');
      return;
    }

    entry.top3Files.forEach((filePath, position) => {
      report.push(` ${position + 1}. ${shown(filePath)}`);
    });
  });

  report.push('\n================================');
  return report.join('\n');
}

src/eval/types.ts

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import type { SearchOptions, CodebaseSearcher } from '../core/search.js';
2+
3+
/**
 * One query of an evaluation fixture, together with its expectations.
 *
 * The harness matches patterns as case-insensitive substrings of the result
 * file paths.
 */
export interface EvalQuery {
4+
// Identifier; copied to `EvalResult.queryId` and printed as `#<id>` in reports.
id: number;
5+
// Query text handed to the searcher.
query: string;
6+
// Free-form category label; copied to the result and printed in reports.
category: string;
7+
// Patterns the top file(s) should match; takes precedence over `expectedTopFiles`.
expectedPatterns?: string[];
8+
// Patterns that disqualify a file; takes precedence over `expectedNotTopFiles`.
expectedNotPatterns?: string[];
9+
// Fallback for `expectedPatterns` when that field is absent.
expectedTopFiles?: string[];
10+
// Fallback for `expectedNotPatterns` when that field is absent.
expectedNotTopFiles?: string[];
11+
// Not read by the harness code visible here; presumably a human-facing remark.
notes?: string;
12+
}
13+
14+
/**
 * A set of evaluation queries plus optional descriptive metadata.
 *
 * Only `queries` is read by the harness code visible here; the remaining
 * fields look like descriptive metadata (NOTE(review): confirm against the
 * fixture files).
 */
export interface EvalFixture {
15+
description?: string;
16+
codebase?: string;
17+
repository?: string;
18+
frozenDate?: string;
19+
notes?: string;
20+
// Queries evaluated one by one, in this order, by `evaluateFixture`.
queries: EvalQuery[];
21+
}
22+
23+
/**
 * Outcome of evaluating a single query, as produced by the harness's
 * `evaluateQuery`. All file-based fields are computed after de-duplicating
 * the search results by file.
 */
export interface EvalResult {
24+
// `id` of the originating query.
queryId: number;
25+
// Query text of the originating query.
query: string;
26+
// Category of the originating query.
category: string;
27+
// Expected patterns that were used for scoring (empty when the query had none).
expectedPatterns: string[];
28+
// Disallowed patterns that were used for scoring (empty when the query had none).
expectedNotPatterns: string[];
29+
// Path of the top-ranked file, or `null` when the search returned nothing.
topFile: string | null;
30+
// Paths of up to three top-ranked files.
top3Files: string[];
31+
// True when the top file matches an expected pattern and no disallowed pattern.
top1Correct: boolean;
32+
// True when any of the top three files satisfies the same condition.
top3Recall: boolean;
33+
// True when at least two of the top three files look like test files.
specContaminated: boolean;
34+
// Score of the top-ranked result; 0 when there are no results.
score: number;
35+
}
36+
37+
/**
 * Aggregate of all per-query results, as produced by the harness's
 * `summarizeEvaluation`. Every rate and the average are 0 when `total` is 0.
 */
export interface EvalSummary {
38+
// Number of evaluated queries.
total: number;
39+
// Count of queries whose top file was correct.
top1Correct: number;
40+
// `top1Correct / total`, in the range 0..1.
top1Accuracy: number;
41+
// Count of queries with a correct file among the top three.
top3RecallCount: number;
42+
// `top3RecallCount / total`, in the range 0..1.
top3Recall: number;
43+
// Count of queries flagged as `specContaminated`.
specContaminatedCount: number;
44+
// `specContaminatedCount / total`, in the range 0..1.
specContaminationRate: number;
45+
// Mean of the per-query `score` values.
avgTopScore: number;
46+
// Number of top-1 correct queries required to pass the gate.
gateThreshold: number;
47+
// True when `total > 0` and `top1Correct >= gateThreshold`.
passesGate: boolean;
48+
// The per-query results the summary was computed from.
results: EvalResult[];
49+
}
50+
51+
/** Arguments accepted by the harness's `evaluateFixture`. */
export interface EvaluateFixtureParams {
52+
// Fixture whose queries are evaluated.
fixture: EvalFixture;
53+
// Searcher whose `search` method is called once per query.
searcher: CodebaseSearcher;
54+
// Passed as the second argument of `search`; defaults to 5 when omitted.
limit?: number;
55+
// Passed unchanged as the fourth argument of `search`.
searchOptions?: SearchOptions;
56+
}
57+
58+
/**
 * Pass/fail gate for an evaluation run.
 *
 * The harness reads a value of 1 or less as the fraction of queries that must
 * be top-1 correct, and a value above 1 as an absolute count of queries.
 */
export type EvalGate = number;
59+
60+
/** Arguments accepted by the harness's `formatEvalReport`. */
export interface FormatEvalReportParams {
61+
// Label printed in the report's title line.
codebaseLabel: string;
62+
// Fixture location printed on the report's `Fixture:` line.
fixturePath: string;
63+
// Summary to render.
summary: EvalSummary;
64+
// When true (the default), file paths are printed as `path#<hash>/<file name>` instead of in full.
redactPaths?: boolean;
65+
}

0 commit comments

Comments
 (0)