Skip to content

Commit 66824f9

Browse files
authored
Merge pull request #88 from PatrickSys/feat/freeze-discovery-benchmark
feat: freeze discovery benchmark contract
2 parents 1ac4671 + e218859 commit 66824f9

13 files changed

+1793
-54
lines changed

docs/capabilities.md

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -282,12 +282,19 @@ Notes:
282282

283283
Reproducible evaluation is shipped as a CLI entrypoint backed by shared scoring/reporting code.
284284

285-
- **Command:** `npm run eval -- <codebaseA> <codebaseB>` (builds first, then runs `scripts/run-eval.mjs`)
286-
- **Shared implementation:** `src/eval/harness.ts` + `src/eval/types.ts` (tests and CLI use the same scoring)
287-
- **Frozen fixtures:**
288-
- `tests/fixtures/eval-angular-spotify.json` (real-world)
289-
- `tests/fixtures/eval-controlled.json` + `tests/fixtures/codebases/eval-controlled/` (offline controlled)
290-
- **Reported metrics:** Top-1 accuracy, Top-3 recall, spec contamination rate, and a gate pass/fail
285+
- **Command:** `npm run eval -- <codebaseA> [codebaseB] [--mode retrieval|discovery] [--competitor-results <path>]` (builds first, then runs `scripts/run-eval.mjs`; `--mode` defaults to `retrieval`)
286+
- **Shared implementation:** `src/eval/harness.ts`, `src/eval/discovery-harness.ts`, and `src/eval/types.ts`
287+
- **Frozen retrieval fixtures:**
288+
- `tests/fixtures/eval-angular-spotify.json`
289+
- `tests/fixtures/eval-controlled.json` + `tests/fixtures/codebases/eval-controlled/`
290+
- **Frozen discovery fixtures:**
291+
- `tests/fixtures/discovery-angular-spotify.json`
292+
- `tests/fixtures/discovery-excalidraw.json`
293+
- `tests/fixtures/discovery-benchmark-protocol.json`
294+
- **Retrieval metrics:** Top-1 accuracy, Top-3 recall, spec contamination rate, and a gate pass/fail
295+
- **Discovery metrics:** usefulness score, payload bytes, estimated tokens, first relevant hit, and best-example usefulness
296+
- **Discovery gate:** discovery mode evaluates the frozen ship gate only when the full public suite and comparator metrics are available; missing comparator evidence is reported as pending, not silently treated as pass/fail
297+
- **Limits:** discovery mode measures discovery only, uses only the currently shipped surfaces, and does not claim implementation quality; named competitor runs remain a documented hybrid/manual lane rather than a built-in automated benchmark
291298

292299
## Limitations
293300

scripts/run-eval.mjs

Lines changed: 137 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@ import { analyzerRegistry } from '../dist/core/analyzer-registry.js';
1111
import { AngularAnalyzer } from '../dist/analyzers/angular/index.js';
1212
import { GenericAnalyzer } from '../dist/analyzers/generic/index.js';
1313
import { evaluateFixture, formatEvalReport } from '../dist/eval/harness.js';
14+
import {
15+
combineDiscoverySummaries,
16+
evaluateDiscoveryGate,
17+
evaluateDiscoveryFixture,
18+
formatDiscoveryReport
19+
} from '../dist/eval/discovery-harness.js';
1420

1521
const __dirname = path.dirname(fileURLToPath(import.meta.url));
1622
const projectRoot = path.join(__dirname, '..');
@@ -20,13 +26,34 @@ const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf-8'));
2026

2127
const defaultFixtureA = path.join(projectRoot, 'tests', 'fixtures', 'eval-angular-spotify.json');
2228
const defaultFixtureB = path.join(projectRoot, 'tests', 'fixtures', 'eval-controlled.json');
29+
const defaultDiscoveryFixtureA = path.join(
30+
projectRoot,
31+
'tests',
32+
'fixtures',
33+
'discovery-angular-spotify.json'
34+
);
35+
const defaultDiscoveryFixtureB = path.join(
36+
projectRoot,
37+
'tests',
38+
'fixtures',
39+
'discovery-excalidraw.json'
40+
);
41+
const defaultDiscoveryProtocol = path.join(
42+
projectRoot,
43+
'tests',
44+
'fixtures',
45+
'discovery-benchmark-protocol.json'
46+
);
2347

2448
const usage = [
2549
`Usage: node scripts/run-eval.mjs <codebaseA> [codebaseB] [options]`,
2650
``,
2751
`Options:`,
52+
` --mode=<retrieval|discovery> Select benchmark mode (default: retrieval)`,
2853
` --fixture-a=<path> Override fixture for codebaseA`,
2954
` --fixture-b=<path> Override fixture for codebaseB`,
55+
` --protocol=<path> Override discovery benchmark protocol`,
56+
` --competitor-results=<path> JSON file with comparator metrics for discovery gate evaluation`,
3057
` --skip-reindex Skip re-index phase`,
3158
` --no-rerank Disable ambiguity reranker`,
3259
` --no-redact Show full file paths in report`,
@@ -87,6 +114,7 @@ async function runSingleEvaluation({
87114
label,
88115
codebasePath,
89116
fixturePath,
117+
mode,
90118
skipReindex,
91119
noRerank,
92120
redactPaths
@@ -98,36 +126,81 @@ async function runSingleEvaluation({
98126
console.log(`\n=== Codebase: ${label} ===`);
99127
console.log(`Target: ${resolvedCodebase}`);
100128
console.log(`Fixture: ${resolvedFixture}`);
101-
console.log(
102-
`Reranker: ${noRerank ? 'DISABLED' : 'enabled (ambiguity-triggered, Xenova/ms-marco-MiniLM-L-6-v2)'}`
103-
);
129+
console.log(`Mode: ${mode}`);
130+
if (mode === 'retrieval') {
131+
console.log(
132+
`Reranker: ${noRerank ? 'DISABLED' : 'enabled (ambiguity-triggered, Xenova/ms-marco-MiniLM-L-6-v2)'}`
133+
);
134+
}
104135
console.log(`Path output: ${redactPaths ? 'REDACTED' : 'FULL'}`);
105136

106137
await maybeReindex(resolvedCodebase, skipReindex);
107138

108-
console.log(`\n--- Phase 2: Running ${fixture.queries.length}-query eval harness ---`);
109-
const searcher = new CodebaseSearcher(resolvedCodebase);
110-
const summary = await evaluateFixture({
111-
fixture,
112-
searcher,
113-
limit: 5,
114-
searchOptions: {
115-
enableReranker: !noRerank
116-
}
117-
});
139+
let summary;
140+
let report;
118141

119-
const report = formatEvalReport({
120-
codebaseLabel: label,
121-
fixturePath: resolvedFixture,
122-
summary,
123-
redactPaths
124-
});
142+
if (mode === 'discovery') {
143+
console.log(`\n--- Phase 2: Running ${fixture.tasks.length}-task discovery harness ---`);
144+
summary = await evaluateDiscoveryFixture({
145+
fixture,
146+
rootPath: resolvedCodebase
147+
});
148+
report = formatDiscoveryReport({
149+
codebaseLabel: label,
150+
fixturePath: resolvedFixture,
151+
summary
152+
});
153+
} else {
154+
console.log(`\n--- Phase 2: Running ${fixture.queries.length}-query eval harness ---`);
155+
const searcher = new CodebaseSearcher(resolvedCodebase);
156+
summary = await evaluateFixture({
157+
fixture,
158+
searcher,
159+
limit: 5,
160+
searchOptions: {
161+
enableReranker: !noRerank
162+
}
163+
});
164+
165+
report = formatEvalReport({
166+
codebaseLabel: label,
167+
fixturePath: resolvedFixture,
168+
summary,
169+
redactPaths
170+
});
171+
}
125172

126173
console.log(report);
127174
return summary;
128175
}
129176

130-
function printCombinedSummary(summaries) {
177+
function printCombinedSummary(summaries, mode) {
178+
if (mode === 'discovery') {
179+
const totalTasks = summaries.reduce((sum, summary) => sum + summary.totalTasks, 0);
180+
const avgUsefulness =
181+
totalTasks > 0
182+
? summaries.reduce((sum, summary) => sum + summary.averageUsefulness * summary.totalTasks, 0) /
183+
totalTasks
184+
: 0;
185+
const avgPayload =
186+
totalTasks > 0
187+
? summaries.reduce((sum, summary) => sum + summary.averagePayloadBytes * summary.totalTasks, 0) /
188+
totalTasks
189+
: 0;
190+
const avgTokens =
191+
totalTasks > 0
192+
? summaries.reduce((sum, summary) => sum + summary.averageEstimatedTokens * summary.totalTasks, 0) /
193+
totalTasks
194+
: 0;
195+
196+
console.log(`\n=== Combined Discovery Summary ===`);
197+
console.log(`Average usefulness: ${(avgUsefulness * 100).toFixed(0)}%`);
198+
console.log(`Average payload: ${Math.round(avgPayload)} bytes`);
199+
console.log(`Average estimated tokens: ${Math.round(avgTokens)}`);
200+
console.log(`=================================\n`);
201+
return;
202+
}
203+
131204
const total = summaries.reduce((sum, summary) => sum + summary.total, 0);
132205
const top1Correct = summaries.reduce((sum, summary) => sum + summary.top1Correct, 0);
133206
const top3RecallCount = summaries.reduce((sum, summary) => sum + summary.top3RecallCount, 0);
@@ -156,8 +229,11 @@ async function main() {
156229
'skip-reindex': { type: 'boolean', default: false },
157230
'no-rerank': { type: 'boolean', default: false },
158231
'no-redact': { type: 'boolean', default: false },
232+
mode: { type: 'string', default: 'retrieval' },
159233
'fixture-a': { type: 'string' },
160-
'fixture-b': { type: 'string' }
234+
'fixture-b': { type: 'string' },
235+
protocol: { type: 'string' },
236+
'competitor-results': { type: 'string' }
161237
},
162238
allowPositionals: true
163239
});
@@ -176,10 +252,26 @@ async function main() {
176252

177253
const codebaseA = positionals[0];
178254
const codebaseB = positionals[1];
179-
const fixtureA = values['fixture-a'] ? path.resolve(values['fixture-a']) : defaultFixtureA;
180-
const fixtureB = values['fixture-b'] ? path.resolve(values['fixture-b']) : defaultFixtureB;
255+
const mode = values.mode === 'discovery' ? 'discovery' : 'retrieval';
256+
const fixtureA = values['fixture-a']
257+
? path.resolve(values['fixture-a'])
258+
: mode === 'discovery'
259+
? defaultDiscoveryFixtureA
260+
: defaultFixtureA;
261+
const fixtureB = values['fixture-b']
262+
? path.resolve(values['fixture-b'])
263+
: mode === 'discovery'
264+
? defaultDiscoveryFixtureB
265+
: defaultFixtureB;
266+
const protocolPath = values.protocol
267+
? path.resolve(values.protocol)
268+
: defaultDiscoveryProtocol;
269+
const comparatorResultsPath = values['competitor-results']
270+
? path.resolve(values['competitor-results'])
271+
: null;
181272

182273
const sharedOptions = {
274+
mode,
183275
skipReindex: values['skip-reindex'],
184276
noRerank: values['no-rerank'],
185277
redactPaths: !values['no-redact']
@@ -193,7 +285,6 @@ async function main() {
193285
});
194286

195287
const summaries = [summaryA];
196-
let passesAllGates = summaryA.passesGate;
197288

198289
if (codebaseB) {
199290
const summaryB = await runSingleEvaluation({
@@ -204,10 +295,30 @@ async function main() {
204295
});
205296

206297
summaries.push(summaryB);
207-
passesAllGates = passesAllGates && summaryB.passesGate;
208-
printCombinedSummary(summaries);
209298
}
210299

300+
if (mode === 'discovery') {
301+
const combinedSummary = combineDiscoverySummaries(summaries);
302+
const protocol = loadFixture(protocolPath);
303+
const comparatorEvidence = comparatorResultsPath ? loadFixture(comparatorResultsPath) : undefined;
304+
const gate = evaluateDiscoveryGate({
305+
summary: combinedSummary,
306+
protocol,
307+
comparatorEvidence,
308+
suiteComplete: summaries.length > 1
309+
});
310+
combinedSummary.gate = gate;
311+
printCombinedSummary([combinedSummary], mode);
312+
console.log(formatDiscoveryReport({
313+
codebaseLabel: 'combined-suite',
314+
fixturePath: protocolPath,
315+
summary: combinedSummary
316+
}));
317+
process.exit(gate.status === 'failed' ? 1 : 0);
318+
}
319+
320+
const passesAllGates = summaries.every((summary) => summary.passesGate);
321+
printCombinedSummary(summaries, mode);
211322
process.exit(passesAllGates ? 0 : 1);
212323
}
213324

0 commit comments

Comments
 (0)