44 * Usage: node --import tsx src/run-benchmark.ts [dataset-path]
55 */
66
7- import { resolve , dirname } from 'node:path' ;
7+ import { resolve , dirname , relative } from 'node:path' ;
88import { writeFile , mkdir } from 'node:fs/promises' ;
99import { fileURLToPath } from 'node:url' ;
1010import { runBenchmark , generateMarkdownReport } from './benchmark.js' ;
1111import { runGrepSearch } from './runners/grep-runner.js' ;
12+ import { runCodeRAGSearch } from './runners/coderag-runner.js' ;
13+ import {
14+ loadConfig ,
15+ LanceDBStore ,
16+ BM25Index ,
17+ HybridSearch ,
18+ OllamaEmbeddingProvider ,
19+ } from '@code-rag/core' ;
1220
1321const __dirname = dirname ( fileURLToPath ( import . meta. url ) ) ;
1422
/**
 * Lower-cased words that are dropped from a query before it is turned into a
 * grep pattern: articles, auxiliaries, prepositions, pronouns, question words
 * and generic request verbs.
 */
const STOP_WORDS: ReadonlySet<string> = new Set([
  'a', 'an', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
  'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
  'should', 'may', 'might', 'can', 'shall', 'must',
  'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as',
  'into', 'through', 'during', 'before', 'after', 'above', 'below',
  'and', 'but', 'or', 'nor', 'not', 'so', 'yet',
  'it', 'its', 'this', 'that', 'these', 'those',
  'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
  'what', 'which', 'who', 'whom', 'when', 'where', 'why', 'how',
  'find', 'show', 'get', 'list', 'display', 'defined',
  'work', 'works', 'working', 'used', 'using', 'use',
  'happen', 'happens', 'between', 'each', 'other',
]);

/**
 * Characters that still need a backslash to be matched literally in a grep
 * basic regular expression (`.` is not listed because it is stripped from the
 * query before keywords are selected).
 */
const GREP_BRE_SPECIAL = /[\\[*^$]/g;

/**
 * Extract keywords from a natural language query for grep.
 *
 * The query is stripped of `?`, `.`, `,` and `!`, split on whitespace, and
 * every word that is three characters or longer and not a stop word
 * (compared case-insensitively) is kept. Repeated words are collapsed so they
 * neither bloat the pattern nor use up the keyword limit.
 *
 * If any surviving word contains an uppercase letter, only those words are
 * used (they are treated as PascalCase/camelCase identifiers); otherwise the
 * first three surviving words are used.
 *
 * @param query - Natural language search query.
 * @returns The selected keywords joined with `\|`, each with regex
 *   metacharacters escaped so it is matched literally. Returns an empty
 *   string when no word survives filtering.
 */
function extractKeywords(query: string): string {
  const words = query
    .replace(/[?.,!]/g, '')
    .split(/\s+/)
    .filter((w) => w.length > 2 && !STOP_WORDS.has(w.toLowerCase()));

  // A Set keeps first-occurrence order, so de-duplicating does not reorder keywords.
  const uniqueWords = [...new Set(words)];

  // If we have PascalCase/camelCase identifiers, prefer those
  const identifiers = uniqueWords.filter((w) => /[A-Z]/.test(w));
  const selected = identifiers.length > 0 ? identifiers : uniqueWords.slice(0, 3);

  // Escape each keyword so characters like `[` or `*` cannot alter the pattern,
  // then join with the basic-regex alternation operator.
  return selected
    .map((w) => w.replace(GREP_BRE_SPECIAL, '\\$&'))
    .join('\\|');
}
57+
1558async function main ( ) : Promise < void > {
1659 const datasetPath =
1760 process . argv [ 2 ] ?? resolve ( __dirname , '../datasets/coderag-queries.json' ) ;
@@ -21,17 +64,67 @@ async function main(): Promise<void> {
2164 console . log ( `Root directory: ${ rootDir } ` ) ;
2265 console . log ( '' ) ;
2366
24- // Run grep baseline
/**
 * Discard any path containing `.claude/worktrees`, then rewrite each remaining
 * path that starts with `/` so it is relative to `rootDir`. Paths that do not
 * start with `/` are passed through unchanged.
 */
function normalizePaths(filePaths: string[]): string[] {
  const normalized: string[] = [];
  for (const filePath of filePaths) {
    if (filePath.includes('.claude/worktrees')) {
      continue;
    }
    if (filePath.startsWith('/')) {
      normalized.push(relative(rootDir, filePath));
    } else {
      normalized.push(filePath);
    }
  }
  return normalized;
}
73+
74+ // Run grep baseline with keyword extraction
2575 console . log ( 'Running grep baseline...' ) ;
2676 const grepReport = await runBenchmark ( datasetPath , 'grep' , async ( query ) => {
27- return runGrepSearch ( query , rootDir ) ;
77+ const keywords = extractKeywords ( query ) ;
78+ const result = await runGrepSearch ( keywords , rootDir ) ;
79+ return { filePaths : normalizePaths ( result . filePaths ) , durationMs : result . durationMs } ;
2880 } ) ;
29-
3081 console . log ( `Grep baseline completed: ${ grepReport . totalQueries } queries` ) ;
3182 console . log ( '' ) ;
3283
84+ // Run CodeRAG hybrid search
85+ console . log ( 'Running CodeRAG hybrid search...' ) ;
86+ const configResult = await loadConfig ( rootDir ) ;
87+ if ( configResult . isErr ( ) ) {
88+ console . error ( 'Failed to load config:' , configResult . error ) ;
89+ process . exit ( 1 ) ;
90+ }
91+
92+ const config = configResult . value ;
93+ const embeddingProvider = new OllamaEmbeddingProvider ( {
94+ model : config . embedding ?. model ?? 'nomic-embed-text' ,
95+ baseUrl : config . embedding ?. ollamaUrl ?? 'http://localhost:11434' ,
96+ } ) ;
97+ const storageDir = resolve ( rootDir , config . storage ?. path ?? '.coderag' ) ;
98+ const dimensions = config . embedding ?. dimensions ?? 768 ;
99+ const vectorStore = new LanceDBStore ( resolve ( storageDir , 'lancedb' ) , dimensions ) ;
100+ // Load BM25 from disk
101+ const { readFile } = await import ( 'node:fs/promises' ) ;
102+ const bm25Path = resolve ( storageDir , 'bm25-index.json' ) ;
103+ const bm25Json = await readFile ( bm25Path , 'utf-8' ) ;
104+ const bm25 = BM25Index . deserialize ( bm25Json ) ;
105+
106+ const searchConfig = {
107+ topK : config . search ?. topK ?? 10 ,
108+ vectorWeight : config . search ?. vectorWeight ?? 0.7 ,
109+ bm25Weight : config . search ?. bm25Weight ?? 0.3 ,
110+ } ;
111+ const hybridSearch = new HybridSearch ( vectorStore , bm25 , embeddingProvider , searchConfig ) ;
112+
113+ const coderagReport = await runBenchmark (
114+ datasetPath ,
115+ 'coderag' ,
116+ async ( query ) => {
117+ const result = await runCodeRAGSearch ( query , hybridSearch ) ;
118+ return { filePaths : normalizePaths ( result . filePaths ) , durationMs : result . durationMs } ;
119+ } ,
120+ ) ;
121+ console . log (
122+ `CodeRAG completed: ${ coderagReport . totalQueries } queries` ,
123+ ) ;
124+ console . log ( '' ) ;
125+
33126 // Generate reports
34- const markdownReport = generateMarkdownReport ( [ grepReport ] ) ;
127+ const markdownReport = generateMarkdownReport ( [ grepReport , coderagReport ] ) ;
35128 console . log ( markdownReport ) ;
36129
37130 // Write JSON report
@@ -40,7 +133,10 @@ async function main(): Promise<void> {
40133
41134 const jsonReportPath = resolve ( resultsDir , 'benchmark-report.json' ) ;
42135 try {
43- await writeFile ( jsonReportPath , JSON . stringify ( grepReport , null , 2 ) ) ;
136+ await writeFile (
137+ jsonReportPath ,
138+ JSON . stringify ( { grep : grepReport , coderag : coderagReport } , null , 2 ) ,
139+ ) ;
44140 console . log ( `JSON report written to: ${ jsonReportPath } ` ) ;
45141 } catch ( error ) {
46142 console . error ( 'Failed to write JSON report:' , error ) ;
0 commit comments