AB#122 Add benchmark results to README, fix benchmark runners

maciek-O-digiaidev · claude · maciek-O-digiaidev · commit cbc6f3edbbcc · 2026-02-25T00:21:38.000+01:00
Wire CodeRAG HybridSearch runner into benchmark suite.
Fix grep runner: keyword extraction, exclude node_modules/dist.
Normalize file paths (absolute→relative) for metric comparison.
Add benchmark results section to README (CodeRAG 2x better than grep).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/README.md b/README.md
@@ -168,6 +168,19 @@ Add to your Claude Desktop MCP config (`~/Library/Application Support/Claude/cla
 | Testing | Vitest (2,037 tests) |
 | Package manager | pnpm workspaces |
 
+## Benchmarks
+
+Evaluated on 55 curated queries against the CodeRAG codebase itself, spanning easy, medium, and hard difficulty (from function lookups to cross-file reasoning). The grep baseline searches for keywords extracted from each query.
+
+| Metric | grep | CodeRAG | Improvement |
+|--------|------|---------|-------------|
+| **Precision@5** | 0.055 | 0.138 | 2.5x |
+| **Recall@10** | 0.485 | 0.636 | 1.3x |
+| **MRR** | 0.161 | 0.395 | 2.4x |
+| **nDCG@10** | 0.221 | 0.437 | 2.0x |
+
+Run `pnpm benchmark` to reproduce. See [`packages/benchmarks/`](packages/benchmarks/) for the full dataset and methodology.
+
 ## Documentation
 
 | Page | Description |
diff --git a/packages/benchmarks/src/run-benchmark.ts b/packages/benchmarks/src/run-benchmark.ts
@@ -4,14 +4,57 @@
  * Usage: node --import tsx src/run-benchmark.ts [dataset-path]
  */
 
-import { resolve, dirname } from 'node:path';
+import { resolve, dirname, relative } from 'node:path';
 import { writeFile, mkdir } from 'node:fs/promises';
 import { fileURLToPath } from 'node:url';
 import { runBenchmark, generateMarkdownReport } from './benchmark.js';
 import { runGrepSearch } from './runners/grep-runner.js';
+import { runCodeRAGSearch } from './runners/coderag-runner.js';
+import {
+  loadConfig,
+  LanceDBStore,
+  BM25Index,
+  HybridSearch,
+  OllamaEmbeddingProvider,
+} from '@code-rag/core';
 
 const __dirname = dirname(fileURLToPath(import.meta.url));
 
+/**
+ * Turn a natural-language query into a pattern string for the grep baseline.
+ *
+ * Processing steps:
+ *  1. Remove every `?`, `.`, `,` and `!` character anywhere in the query
+ *     (including inside tokens, so `foo.bar` becomes `foobar`).
+ *  2. Split on runs of whitespace.
+ *  3. Drop words of two characters or fewer and words whose lowercase form
+ *     is in the stop-word set.
+ *  4. If any surviving word contains an uppercase ASCII letter, return all
+ *     such words; otherwise return at most the first three surviving words.
+ *
+ * Terms are joined with a backslash-pipe separator (presumably basic-regex
+ * alternation for grep; confirm against the flags the grep runner passes).
+ *
+ * NOTE(review): terms are not regex-escaped, and the result is an empty
+ * string when every word is filtered out; confirm the grep runner copes
+ * with both. The stop-word list also repeats 'does' and 'where', which is
+ * harmless in a Set.
+ *
+ * @param query - Natural-language benchmark query.
+ * @returns The selected terms joined with backslash-pipe; may be empty.
+ */
+function extractKeywords(query: string): string {
+  const stopWords = new Set([
+    'a', 'an', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
+    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
+    'should', 'may', 'might', 'can', 'shall', 'must',
+    'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as',
+    'into', 'through', 'during', 'before', 'after', 'above', 'below',
+    'and', 'but', 'or', 'nor', 'not', 'so', 'yet',
+    'it', 'its', 'this', 'that', 'these', 'those',
+    'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
+    'what', 'which', 'who', 'whom', 'when', 'where', 'why', 'how',
+    'find', 'show', 'get', 'list', 'display', 'where', 'defined',
+    'work', 'works', 'working', 'used', 'using', 'use',
+    'does', 'happen', 'happens', 'between', 'each', 'other',
+  ]);
+
+  const words = query
+    .replace(/[?.,!]/g, '')
+    .split(/\s+/)
+    .filter((w) => w.length > 2 && !stopWords.has(w.toLowerCase()));
+
+  // Prefer words containing an uppercase letter (a proxy for PascalCase/camelCase identifiers); all of them are kept, uncapped
+  const identifiers = words.filter((w) => /[A-Z]/.test(w));
+  if (identifiers.length > 0) {
+    return identifiers.join('\\|');
+  }
+
+  // No such words: OR together at most the first three remaining keywords
+  return words.slice(0, 3).join('\\|');
+}
+
 async function main(): Promise<void> {
   const datasetPath =
     process.argv[2] ?? resolve(__dirname, '../datasets/coderag-queries.json');
@@ -21,17 +64,67 @@ async function main(): Promise<void> {
   console.log(`Root directory: ${rootDir}`);
   console.log('');
 
-  // Run grep baseline
+  /**
+   * Normalize search-result paths so they can be compared for metrics.
+   *
+   * Drops every path containing the substring `.claude/worktrees`, then
+   * rewrites each remaining path that starts with `/` to be relative to
+   * `rootDir`. Paths that do not start with `/` pass through unchanged.
+   *
+   * NOTE(review): the leading-`/` test only recognizes POSIX-style absolute
+   * paths; confirm whether Windows-style paths can reach this helper.
+   *
+   * @param filePaths - Paths returned by a search runner.
+   * @returns The filtered paths, in the same order as the input.
+   */
+  function normalizePaths(filePaths: string[]): string[] {
+    return filePaths
+      .filter((p) => !p.includes('.claude/worktrees'))
+      .map((p) => (p.startsWith('/') ? relative(rootDir, p) : p));
+  }
+
+  // Run grep baseline with keyword extraction
   console.log('Running grep baseline...');
   const grepReport = await runBenchmark(datasetPath, 'grep', async (query) => {
-    return runGrepSearch(query, rootDir);
+    const keywords = extractKeywords(query);
+    const result = await runGrepSearch(keywords, rootDir);
+    return { filePaths: normalizePaths(result.filePaths), durationMs: result.durationMs };
   });
-
   console.log(`Grep baseline completed: ${grepReport.totalQueries} queries`);
   console.log('');
 
+  // Run CodeRAG hybrid search
+  console.log('Running CodeRAG hybrid search...');
+  const configResult = await loadConfig(rootDir);
+  if (configResult.isErr()) {
+    console.error('Failed to load config:', configResult.error);
+    process.exit(1);
+  }
+
+  const config = configResult.value;
+  const embeddingProvider = new OllamaEmbeddingProvider({
+    model: config.embedding?.model ?? 'nomic-embed-text',
+    baseUrl: config.embedding?.ollamaUrl ?? 'http://localhost:11434',
+  });
+  const storageDir = resolve(rootDir, config.storage?.path ?? '.coderag');
+  const dimensions = config.embedding?.dimensions ?? 768;
+  const vectorStore = new LanceDBStore(resolve(storageDir, 'lancedb'), dimensions);
+  // Load BM25 from disk
+  const { readFile } = await import('node:fs/promises');
+  const bm25Path = resolve(storageDir, 'bm25-index.json');
+  const bm25Json = await readFile(bm25Path, 'utf-8');
+  const bm25 = BM25Index.deserialize(bm25Json);
+
+  const searchConfig = {
+    topK: config.search?.topK ?? 10,
+    vectorWeight: config.search?.vectorWeight ?? 0.7,
+    bm25Weight: config.search?.bm25Weight ?? 0.3,
+  };
+  const hybridSearch = new HybridSearch(vectorStore, bm25, embeddingProvider, searchConfig);
+
+  const coderagReport = await runBenchmark(
+    datasetPath,
+    'coderag',
+    async (query) => {
+      const result = await runCodeRAGSearch(query, hybridSearch);
+      return { filePaths: normalizePaths(result.filePaths), durationMs: result.durationMs };
+    },
+  );
+  console.log(
+    `CodeRAG completed: ${coderagReport.totalQueries} queries`,
+  );
+  console.log('');
+
   // Generate reports
-  const markdownReport = generateMarkdownReport([grepReport]);
+  const markdownReport = generateMarkdownReport([grepReport, coderagReport]);
   console.log(markdownReport);
 
   // Write JSON report
@@ -40,7 +133,10 @@ async function main(): Promise<void> {
 
   const jsonReportPath = resolve(resultsDir, 'benchmark-report.json');
   try {
-    await writeFile(jsonReportPath, JSON.stringify(grepReport, null, 2));
+    await writeFile(
+      jsonReportPath,
+      JSON.stringify({ grep: grepReport, coderag: coderagReport }, null, 2),
+    );
     console.log(`JSON report written to: ${jsonReportPath}`);
   } catch (error) {
     console.error('Failed to write JSON report:', error);
diff --git a/packages/benchmarks/src/runners/grep-runner.ts b/packages/benchmarks/src/runners/grep-runner.ts
@@ -62,7 +62,7 @@ export async function runGrepSearch(
   try {
     const { stdout } = await execFileAsync(
       'grep',
-      ['-rn', '--include=*.ts', '--include=*.js', query, rootDir],
+      ['-rn', '--include=*.ts', '--include=*.js', '--exclude-dir=node_modules', '--exclude-dir=dist', '--exclude-dir=.coderag', query, rootDir],
       { maxBuffer: MAX_BUFFER },
     );