Skip to content

Commit 21c3e43

Browse files
committed
fix: align discovery protocol metrics
1 parent b4ab479 commit 21c3e43

File tree

4 files changed

+13
-7
lines changed

4 files changed

+13
-7
lines changed

scripts/run-eval.mjs

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -285,7 +285,6 @@ async function main() {
285285
});
286286

287287
const summaries = [summaryA];
288-
let passesAllGates = summaryA.passesGate;
289288

290289
if (codebaseB) {
291290
const summaryB = await runSingleEvaluation({
@@ -296,8 +295,6 @@ async function main() {
296295
});
297296

298297
summaries.push(summaryB);
299-
passesAllGates =
300-
mode === 'discovery' ? passesAllGates : passesAllGates && summaryB.passesGate;
301298
}
302299

303300
if (mode === 'discovery') {
@@ -320,6 +317,7 @@ async function main() {
320317
process.exit(gate.status === 'failed' ? 1 : 0);
321318
}
322319

320+
const passesAllGates = summaries.every((summary) => summary.passesGate);
323321
printCombinedSummary(summaries, mode);
324322
process.exit(passesAllGates ? 0 : 1);
325323
}

src/eval/discovery-harness.ts

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -240,7 +240,10 @@ function compareMetric(
240240
comparatorValue: number | null,
241241
metric: DiscoveryMetricName
242242
): DiscoveryMetricComparison {
243-
const lowerIsBetter = metric === 'averageEstimatedTokens' || metric === 'averageFirstRelevantHit';
243+
const lowerIsBetter =
244+
metric === 'averagePayloadBytes' ||
245+
metric === 'averageEstimatedTokens' ||
246+
metric === 'averageFirstRelevantHit';
244247
const passes =
245248
actualValue !== null &&
246249
comparatorValue !== null &&
@@ -260,7 +263,10 @@ function compareMetricWithinTolerance(
260263
metric: DiscoveryMetricName,
261264
tolerancePercent: number
262265
): DiscoveryMetricComparison {
263-
const lowerIsBetter = metric === 'averageFirstRelevantHit';
266+
const lowerIsBetter =
267+
metric === 'averagePayloadBytes' ||
268+
metric === 'averageEstimatedTokens' ||
269+
metric === 'averageFirstRelevantHit';
264270
const multiplier = 1 + tolerancePercent / 100;
265271
const passes =
266272
actualValue !== null &&

src/eval/types.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -151,6 +151,7 @@ export interface DiscoverySurfaceResult {
151151

152152
export type DiscoveryMetricName =
153153
| 'averageUsefulness'
154+
| 'averagePayloadBytes'
154155
| 'averageEstimatedTokens'
155156
| 'averageFirstRelevantHit'
156157
| 'bestExampleUsefulnessRate';
@@ -198,6 +199,7 @@ export interface DiscoveryBenchmarkProtocol {
198199

199200
export interface DiscoveryComparatorMetrics {
200201
averageUsefulness?: number | null;
202+
averagePayloadBytes?: number | null;
201203
averageEstimatedTokens?: number | null;
202204
averageFirstRelevantHit?: number | null;
203205
bestExampleUsefulnessRate?: number | null;

tests/fixtures/discovery-benchmark-protocol.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -39,8 +39,8 @@
3939
}
4040
],
4141
"metrics": {
42-
"payloadCost": ["payloadBytes", "estimatedTokens"],
43-
"usefulness": ["usefulnessScore", "firstRelevantHit", "bestExampleUseful"]
42+
"payloadCost": ["averagePayloadBytes", "averageEstimatedTokens"],
43+
"usefulness": ["averageUsefulness", "averageFirstRelevantHit", "bestExampleUsefulnessRate"]
4444
},
4545
"fairnessRules": [
4646
"Use only current shipped codebase-context surfaces in the direct-tool lane.",

0 commit comments

Comments
 (0)