@@ -11,6 +11,12 @@ import { analyzerRegistry } from '../dist/core/analyzer-registry.js';
1111import { AngularAnalyzer } from '../dist/analyzers/angular/index.js' ;
1212import { GenericAnalyzer } from '../dist/analyzers/generic/index.js' ;
1313import { evaluateFixture , formatEvalReport } from '../dist/eval/harness.js' ;
14+ import {
15+ combineDiscoverySummaries ,
16+ evaluateDiscoveryGate ,
17+ evaluateDiscoveryFixture ,
18+ formatDiscoveryReport
19+ } from '../dist/eval/discovery-harness.js' ;
1420
1521const __dirname = path . dirname ( fileURLToPath ( import . meta. url ) ) ;
1622const projectRoot = path . join ( __dirname , '..' ) ;
@@ -20,13 +26,34 @@ const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf-8'));
2026
2127const defaultFixtureA = path . join ( projectRoot , 'tests' , 'fixtures' , 'eval-angular-spotify.json' ) ;
2228const defaultFixtureB = path . join ( projectRoot , 'tests' , 'fixtures' , 'eval-controlled.json' ) ;
29+ const defaultDiscoveryFixtureA = path . join (
30+ projectRoot ,
31+ 'tests' ,
32+ 'fixtures' ,
33+ 'discovery-angular-spotify.json'
34+ ) ;
35+ const defaultDiscoveryFixtureB = path . join (
36+ projectRoot ,
37+ 'tests' ,
38+ 'fixtures' ,
39+ 'discovery-excalidraw.json'
40+ ) ;
41+ const defaultDiscoveryProtocol = path . join (
42+ projectRoot ,
43+ 'tests' ,
44+ 'fixtures' ,
45+ 'discovery-benchmark-protocol.json'
46+ ) ;
2347
2448const usage = [
2549 `Usage: node scripts/run-eval.mjs <codebaseA> [codebaseB] [options]` ,
2650 `` ,
2751 `Options:` ,
52+ ` --mode=<retrieval|discovery> Select benchmark mode (default: retrieval)` ,
2853 ` --fixture-a=<path> Override fixture for codebaseA` ,
2954 ` --fixture-b=<path> Override fixture for codebaseB` ,
55+ ` --protocol=<path> Override discovery benchmark protocol` ,
56+ ` --competitor-results=<path> JSON file with comparator metrics for discovery gate evaluation` ,
3057 ` --skip-reindex Skip re-index phase` ,
3158 ` --no-rerank Disable ambiguity reranker` ,
3259 ` --no-redact Show full file paths in report` ,
@@ -87,6 +114,7 @@ async function runSingleEvaluation({
87114 label,
88115 codebasePath,
89116 fixturePath,
117+ mode,
90118 skipReindex,
91119 noRerank,
92120 redactPaths
@@ -98,36 +126,81 @@ async function runSingleEvaluation({
98126 console . log ( `\n=== Codebase: ${ label } ===` ) ;
99127 console . log ( `Target: ${ resolvedCodebase } ` ) ;
100128 console . log ( `Fixture: ${ resolvedFixture } ` ) ;
101- console . log (
102- `Reranker: ${ noRerank ? 'DISABLED' : 'enabled (ambiguity-triggered, Xenova/ms-marco-MiniLM-L-6-v2)' } `
103- ) ;
129+ console . log ( `Mode: ${ mode } ` ) ;
130+ if ( mode === 'retrieval' ) {
131+ console . log (
132+ `Reranker: ${ noRerank ? 'DISABLED' : 'enabled (ambiguity-triggered, Xenova/ms-marco-MiniLM-L-6-v2)' } `
133+ ) ;
134+ }
104135 console . log ( `Path output: ${ redactPaths ? 'REDACTED' : 'FULL' } ` ) ;
105136
106137 await maybeReindex ( resolvedCodebase , skipReindex ) ;
107138
108- console . log ( `\n--- Phase 2: Running ${ fixture . queries . length } -query eval harness ---` ) ;
109- const searcher = new CodebaseSearcher ( resolvedCodebase ) ;
110- const summary = await evaluateFixture ( {
111- fixture,
112- searcher,
113- limit : 5 ,
114- searchOptions : {
115- enableReranker : ! noRerank
116- }
117- } ) ;
139+ let summary ;
140+ let report ;
118141
119- const report = formatEvalReport ( {
120- codebaseLabel : label ,
121- fixturePath : resolvedFixture ,
122- summary,
123- redactPaths
124- } ) ;
142+ if ( mode === 'discovery' ) {
143+ console . log ( `\n--- Phase 2: Running ${ fixture . tasks . length } -task discovery harness ---` ) ;
144+ summary = await evaluateDiscoveryFixture ( {
145+ fixture,
146+ rootPath : resolvedCodebase
147+ } ) ;
148+ report = formatDiscoveryReport ( {
149+ codebaseLabel : label ,
150+ fixturePath : resolvedFixture ,
151+ summary
152+ } ) ;
153+ } else {
154+ console . log ( `\n--- Phase 2: Running ${ fixture . queries . length } -query eval harness ---` ) ;
155+ const searcher = new CodebaseSearcher ( resolvedCodebase ) ;
156+ summary = await evaluateFixture ( {
157+ fixture,
158+ searcher,
159+ limit : 5 ,
160+ searchOptions : {
161+ enableReranker : ! noRerank
162+ }
163+ } ) ;
164+
165+ report = formatEvalReport ( {
166+ codebaseLabel : label ,
167+ fixturePath : resolvedFixture ,
168+ summary,
169+ redactPaths
170+ } ) ;
171+ }
125172
126173 console . log ( report ) ;
127174 return summary ;
128175}
129176
130- function printCombinedSummary ( summaries ) {
177+ function printCombinedSummary ( summaries , mode ) {
178+ if ( mode === 'discovery' ) {
179+ const totalTasks = summaries . reduce ( ( sum , summary ) => sum + summary . totalTasks , 0 ) ;
180+ const avgUsefulness =
181+ totalTasks > 0
182+ ? summaries . reduce ( ( sum , summary ) => sum + summary . averageUsefulness * summary . totalTasks , 0 ) /
183+ totalTasks
184+ : 0 ;
185+ const avgPayload =
186+ totalTasks > 0
187+ ? summaries . reduce ( ( sum , summary ) => sum + summary . averagePayloadBytes * summary . totalTasks , 0 ) /
188+ totalTasks
189+ : 0 ;
190+ const avgTokens =
191+ totalTasks > 0
192+ ? summaries . reduce ( ( sum , summary ) => sum + summary . averageEstimatedTokens * summary . totalTasks , 0 ) /
193+ totalTasks
194+ : 0 ;
195+
196+ console . log ( `\n=== Combined Discovery Summary ===` ) ;
197+ console . log ( `Average usefulness: ${ ( avgUsefulness * 100 ) . toFixed ( 0 ) } %` ) ;
198+ console . log ( `Average payload: ${ Math . round ( avgPayload ) } bytes` ) ;
199+ console . log ( `Average estimated tokens: ${ Math . round ( avgTokens ) } ` ) ;
200+ console . log ( `=================================\n` ) ;
201+ return ;
202+ }
203+
131204 const total = summaries . reduce ( ( sum , summary ) => sum + summary . total , 0 ) ;
132205 const top1Correct = summaries . reduce ( ( sum , summary ) => sum + summary . top1Correct , 0 ) ;
133206 const top3RecallCount = summaries . reduce ( ( sum , summary ) => sum + summary . top3RecallCount , 0 ) ;
@@ -156,8 +229,11 @@ async function main() {
156229 'skip-reindex' : { type : 'boolean' , default : false } ,
157230 'no-rerank' : { type : 'boolean' , default : false } ,
158231 'no-redact' : { type : 'boolean' , default : false } ,
232+ mode : { type : 'string' , default : 'retrieval' } ,
159233 'fixture-a' : { type : 'string' } ,
160- 'fixture-b' : { type : 'string' }
234+ 'fixture-b' : { type : 'string' } ,
235+ protocol : { type : 'string' } ,
236+ 'competitor-results' : { type : 'string' }
161237 } ,
162238 allowPositionals : true
163239 } ) ;
@@ -176,10 +252,26 @@ async function main() {
176252
177253 const codebaseA = positionals [ 0 ] ;
178254 const codebaseB = positionals [ 1 ] ;
179- const fixtureA = values [ 'fixture-a' ] ? path . resolve ( values [ 'fixture-a' ] ) : defaultFixtureA ;
180- const fixtureB = values [ 'fixture-b' ] ? path . resolve ( values [ 'fixture-b' ] ) : defaultFixtureB ;
255+ const mode = values . mode === 'discovery' ? 'discovery' : 'retrieval' ;
256+ const fixtureA = values [ 'fixture-a' ]
257+ ? path . resolve ( values [ 'fixture-a' ] )
258+ : mode === 'discovery'
259+ ? defaultDiscoveryFixtureA
260+ : defaultFixtureA ;
261+ const fixtureB = values [ 'fixture-b' ]
262+ ? path . resolve ( values [ 'fixture-b' ] )
263+ : mode === 'discovery'
264+ ? defaultDiscoveryFixtureB
265+ : defaultFixtureB ;
266+ const protocolPath = values . protocol
267+ ? path . resolve ( values . protocol )
268+ : defaultDiscoveryProtocol ;
269+ const comparatorResultsPath = values [ 'competitor-results' ]
270+ ? path . resolve ( values [ 'competitor-results' ] )
271+ : null ;
181272
182273 const sharedOptions = {
274+ mode,
183275 skipReindex : values [ 'skip-reindex' ] ,
184276 noRerank : values [ 'no-rerank' ] ,
185277 redactPaths : ! values [ 'no-redact' ]
@@ -204,10 +296,31 @@ async function main() {
204296 } ) ;
205297
206298 summaries . push ( summaryB ) ;
207- passesAllGates = passesAllGates && summaryB . passesGate ;
208- printCombinedSummary ( summaries ) ;
299+ passesAllGates =
300+ mode === 'discovery' ? passesAllGates : passesAllGates && summaryB . passesGate ;
301+ }
302+
303+ if ( mode === 'discovery' ) {
304+ const combinedSummary = combineDiscoverySummaries ( summaries ) ;
305+ const protocol = loadFixture ( protocolPath ) ;
306+ const comparatorEvidence = comparatorResultsPath ? loadFixture ( comparatorResultsPath ) : undefined ;
307+ const gate = evaluateDiscoveryGate ( {
308+ summary : combinedSummary ,
309+ protocol,
310+ comparatorEvidence,
311+ suiteComplete : summaries . length > 1
312+ } ) ;
313+ combinedSummary . gate = gate ;
314+ printCombinedSummary ( [ combinedSummary ] , mode ) ;
315+ console . log ( formatDiscoveryReport ( {
316+ codebaseLabel : 'combined-suite' ,
317+ fixturePath : protocolPath ,
318+ summary : combinedSummary
319+ } ) ) ;
320+ process . exit ( gate . status === 'failed' ? 1 : 0 ) ;
209321 }
210322
323+ printCombinedSummary ( summaries , mode ) ;
211324 process . exit ( passesAllGates ? 0 : 1 ) ;
212325}
213326
0 commit comments