Skip to content

Commit cbc6f3e

Browse files
AB#122 Add benchmark results to README, fix benchmark runners
Wire CodeRAG HybridSearch runner into benchmark suite. Fix grep runner: keyword extraction, exclude node_modules/dist. Normalize file paths (absolute→relative) for metric comparison. Add benchmark results section to README (CodeRAG 2x better than grep). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b6f1ad0 commit cbc6f3e

3 files changed

Lines changed: 116 additions & 7 deletions

File tree

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,19 @@ Add to your Claude Desktop MCP config (`~/Library/Application Support/Claude/cla
168168
| Testing | Vitest (2,037 tests) |
169169
| Package manager | pnpm workspaces |
170170

171+
## Benchmarks
172+
173+
Evaluated on 55 curated queries against the CodeRAG codebase itself, spanning easy, medium, and hard difficulty (from function lookups to cross-file reasoning). The grep baseline searches for keywords extracted from each query.
174+
175+
| Metric | grep | CodeRAG | Improvement |
176+
|--------|------|---------|-------------|
177+
| **Precision@5** | 0.055 | 0.138 | 2.5x |
178+
| **Recall@10** | 0.485 | 0.636 | 1.3x |
179+
| **MRR** | 0.161 | 0.395 | 2.4x |
180+
| **nDCG@10** | 0.221 | 0.437 | 2.0x |
181+
182+
Run `pnpm benchmark` to reproduce. See [`packages/benchmarks/`](packages/benchmarks/) for the full dataset and methodology.
183+
171184
## Documentation
172185

173186
| Page | Description |

packages/benchmarks/src/run-benchmark.ts

Lines changed: 102 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,57 @@
44
* Usage: node --import tsx src/run-benchmark.ts [dataset-path]
55
*/
66

7-
import { resolve, dirname } from 'node:path';
7+
import { resolve, dirname, relative } from 'node:path';
88
import { writeFile, mkdir } from 'node:fs/promises';
99
import { fileURLToPath } from 'node:url';
1010
import { runBenchmark, generateMarkdownReport } from './benchmark.js';
1111
import { runGrepSearch } from './runners/grep-runner.js';
12+
import { runCodeRAGSearch } from './runners/coderag-runner.js';
13+
import {
14+
loadConfig,
15+
LanceDBStore,
16+
BM25Index,
17+
HybridSearch,
18+
OllamaEmbeddingProvider,
19+
} from '@code-rag/core';
1220

1321
const __dirname = dirname(fileURLToPath(import.meta.url));
1422

23+
/**
 * Words that carry no search value in a natural-language query.
 * Built once at module load; membership is checked against lower-cased tokens.
 */
const GREP_STOP_WORDS: ReadonlySet<string> = new Set([
  'a', 'an', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
  'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
  'should', 'may', 'might', 'can', 'shall', 'must',
  'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as',
  'into', 'through', 'during', 'before', 'after', 'above', 'below',
  'and', 'but', 'or', 'nor', 'not', 'so', 'yet',
  'it', 'its', 'this', 'that', 'these', 'those',
  'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
  'what', 'which', 'who', 'whom', 'when', 'where', 'why', 'how',
  'find', 'show', 'get', 'list', 'display', 'defined',
  'work', 'works', 'working', 'used', 'using', 'use',
  'happen', 'happens', 'between', 'each', 'other',
]);

/**
 * Escape the characters that are special in a grep basic regular expression
 * (`\`, `.`, `*`, `[`, `^`, `$`) so a term is matched literally.
 *
 * Characters such as `+`, `?`, `(`, `{` and `|` are deliberately left alone:
 * in a basic regex they are literal when unescaped and only become operators
 * when preceded by a backslash.
 */
function escapeGrepTerm(term: string): string {
  return term.replace(/[\\.*[^$]/g, '\\$&');
}

/**
 * Extract keywords from a natural language query for grep.
 *
 * Strips `?`, `.`, `,` and `!`, splits on whitespace, and drops stop words and
 * words of two characters or fewer. Terms containing an uppercase letter are
 * treated as code identifiers and, when present, are the only terms returned;
 * otherwise the first three remaining keywords are used.
 *
 * @param query - Natural-language query text.
 * @returns Terms joined with `\|` (basic-regex alternation), each escaped so it
 *   is matched literally. Returns an empty string only when the query contains
 *   no usable token at all.
 */
function extractKeywords(query: string): string {
  const tokens = query
    .replace(/[?.,!]/g, '')
    .split(/\s+/)
    // A pattern beginning with '-' would be parsed by grep as an option,
    // because the runner passes the pattern as a positional argument.
    .map((token) => token.replace(/^-+/, ''))
    .filter((token) => token.length > 0);

  const keywords = Array.from(
    new Set(
      tokens.filter(
        (token) => token.length > 2 && !GREP_STOP_WORDS.has(token.toLowerCase()),
      ),
    ),
  );

  // If we have PascalCase/camelCase identifiers, prefer those.
  // NOTE(review): any uppercase letter qualifies, so a capitalised ordinary
  // word (e.g. at the start of a sentence) is also treated as an identifier.
  const identifiers = keywords.filter((token) => /[A-Z]/.test(token));

  let selected: string[];
  if (identifiers.length > 0) {
    selected = identifiers;
  } else if (keywords.length > 0) {
    selected = keywords.slice(0, 3);
  } else {
    // Everything was filtered out. An empty pattern would match every line of
    // every file, so fall back to the longest raw tokens instead.
    selected = Array.from(new Set(tokens))
      .sort((a, b) => b.length - a.length)
      .slice(0, 3);
  }

  return selected.map(escapeGrepTerm).join('\\|');
}
57+
1558
async function main(): Promise<void> {
1659
const datasetPath =
1760
process.argv[2] ?? resolve(__dirname, '../datasets/coderag-queries.json');
@@ -21,17 +64,67 @@ async function main(): Promise<void> {
2164
console.log(`Root directory: ${rootDir}`);
2265
console.log('');
2366

24-
// Run grep baseline
67+
/** Normalize absolute paths to relative (from rootDir), filter out worktrees. */
68+
function normalizePaths(filePaths: string[]): string[] {
69+
return filePaths
70+
.filter((p) => !p.includes('.claude/worktrees'))
71+
.map((p) => (p.startsWith('/') ? relative(rootDir, p) : p));
72+
}
73+
74+
// Run grep baseline with keyword extraction
2575
console.log('Running grep baseline...');
2676
const grepReport = await runBenchmark(datasetPath, 'grep', async (query) => {
27-
return runGrepSearch(query, rootDir);
77+
const keywords = extractKeywords(query);
78+
const result = await runGrepSearch(keywords, rootDir);
79+
return { filePaths: normalizePaths(result.filePaths), durationMs: result.durationMs };
2880
});
29-
3081
console.log(`Grep baseline completed: ${grepReport.totalQueries} queries`);
3182
console.log('');
3283

84+
// Run CodeRAG hybrid search
85+
console.log('Running CodeRAG hybrid search...');
86+
const configResult = await loadConfig(rootDir);
87+
if (configResult.isErr()) {
88+
console.error('Failed to load config:', configResult.error);
89+
process.exit(1);
90+
}
91+
92+
const config = configResult.value;
93+
const embeddingProvider = new OllamaEmbeddingProvider({
94+
model: config.embedding?.model ?? 'nomic-embed-text',
95+
baseUrl: config.embedding?.ollamaUrl ?? 'http://localhost:11434',
96+
});
97+
const storageDir = resolve(rootDir, config.storage?.path ?? '.coderag');
98+
const dimensions = config.embedding?.dimensions ?? 768;
99+
const vectorStore = new LanceDBStore(resolve(storageDir, 'lancedb'), dimensions);
100+
// Load BM25 from disk
101+
const { readFile } = await import('node:fs/promises');
102+
const bm25Path = resolve(storageDir, 'bm25-index.json');
103+
const bm25Json = await readFile(bm25Path, 'utf-8');
104+
const bm25 = BM25Index.deserialize(bm25Json);
105+
106+
const searchConfig = {
107+
topK: config.search?.topK ?? 10,
108+
vectorWeight: config.search?.vectorWeight ?? 0.7,
109+
bm25Weight: config.search?.bm25Weight ?? 0.3,
110+
};
111+
const hybridSearch = new HybridSearch(vectorStore, bm25, embeddingProvider, searchConfig);
112+
113+
const coderagReport = await runBenchmark(
114+
datasetPath,
115+
'coderag',
116+
async (query) => {
117+
const result = await runCodeRAGSearch(query, hybridSearch);
118+
return { filePaths: normalizePaths(result.filePaths), durationMs: result.durationMs };
119+
},
120+
);
121+
console.log(
122+
`CodeRAG completed: ${coderagReport.totalQueries} queries`,
123+
);
124+
console.log('');
125+
33126
// Generate reports
34-
const markdownReport = generateMarkdownReport([grepReport]);
127+
const markdownReport = generateMarkdownReport([grepReport, coderagReport]);
35128
console.log(markdownReport);
36129

37130
// Write JSON report
@@ -40,7 +133,10 @@ async function main(): Promise<void> {
40133

41134
const jsonReportPath = resolve(resultsDir, 'benchmark-report.json');
42135
try {
43-
await writeFile(jsonReportPath, JSON.stringify(grepReport, null, 2));
136+
await writeFile(
137+
jsonReportPath,
138+
JSON.stringify({ grep: grepReport, coderag: coderagReport }, null, 2),
139+
);
44140
console.log(`JSON report written to: ${jsonReportPath}`);
45141
} catch (error) {
46142
console.error('Failed to write JSON report:', error);

packages/benchmarks/src/runners/grep-runner.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ export async function runGrepSearch(
6262
try {
6363
const { stdout } = await execFileAsync(
6464
'grep',
65-
['-rn', '--include=*.ts', '--include=*.js', query, rootDir],
65+
['-rn', '--include=*.ts', '--include=*.js', '--exclude-dir=node_modules', '--exclude-dir=dist', '--exclude-dir=.coderag', query, rootDir],
6666
{ maxBuffer: MAX_BUFFER },
6767
);
6868

0 commit comments

Comments
 (0)