diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 438f495c4..94e2f131a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -9,7 +9,7 @@ on: permissions: {} jobs: - benchmark: + build-benchmark: runs-on: ubuntu-latest if: >- github.event_name == 'workflow_dispatch' || @@ -31,16 +31,22 @@ jobs: - run: npm install - - name: Run benchmark + - name: Run build benchmark run: node scripts/benchmark.js 2>/dev/null > benchmark-result.json - - name: Update report + - name: Update build report run: node scripts/update-benchmark-report.js benchmark-result.json + - name: Upload build result + uses: actions/upload-artifact@v4 + with: + name: build-benchmark-result + path: benchmark-result.json + - name: Check for changes id: changes run: | - if git diff --quiet HEAD -- generated/BENCHMARKS.md README.md; then + if git diff --quiet HEAD -- generated/BUILD-BENCHMARKS.md README.md; then echo "changed=false" >> "$GITHUB_OUTPUT" else echo "changed=true" >> "$GITHUB_OUTPUT" @@ -54,20 +60,89 @@ jobs: git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" - BRANCH="benchmark/update-$(date +%Y%m%d-%H%M%S)" + BRANCH="benchmark/build-$(date +%Y%m%d-%H%M%S)" git checkout -b "$BRANCH" - git add generated/BENCHMARKS.md README.md - git commit -m "docs: update performance benchmarks" + git add generated/BUILD-BENCHMARKS.md README.md + git commit -m "docs: update build performance benchmarks" git push origin "$BRANCH" gh pr create \ --base main \ --head "$BRANCH" \ - --title "docs: update performance benchmarks" \ - --body "Automated benchmark update from workflow run [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})." 
+ --title "docs: update build performance benchmarks" \ + --body "Automated build benchmark update from workflow run [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})." + + embedding-benchmark: + runs-on: ubuntu-latest + if: >- + github.event_name == 'workflow_dispatch' || + github.event.workflow_run.conclusion == 'success' + permissions: + contents: write + pull-requests: write + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: main + token: ${{ secrets.GITHUB_TOKEN }} + + - uses: actions/setup-node@v4 + with: + node-version: "22" + + - run: npm install + + - name: Cache HuggingFace models + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: hf-models-${{ runner.os }}-${{ hashFiles('src/embedder.js') }} + restore-keys: hf-models-${{ runner.os }}- + + - name: Build graph + run: node src/cli.js build . - - name: Upload result artifact + - name: Run embedding benchmark + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: node scripts/embedding-benchmark.js 2>/dev/null > embedding-benchmark-result.json + + - name: Update embedding report + run: node scripts/update-embedding-report.js embedding-benchmark-result.json + + - name: Upload embedding result uses: actions/upload-artifact@v4 with: - name: benchmark-result - path: benchmark-result.json + name: embedding-benchmark-result + path: embedding-benchmark-result.json + + - name: Check for changes + id: changes + run: | + if git diff --quiet HEAD -- generated/EMBEDDING-BENCHMARKS.md; then + echo "changed=false" >> "$GITHUB_OUTPUT" + else + echo "changed=true" >> "$GITHUB_OUTPUT" + fi + + - name: Commit and push via PR + if: steps.changes.outputs.changed == 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + BRANCH="benchmark/embedding-$(date +%Y%m%d-%H%M%S)" + git checkout -b "$BRANCH" + git add 
generated/EMBEDDING-BENCHMARKS.md + git commit -m "docs: update embedding benchmarks" + git push origin "$BRANCH" + + gh pr create \ + --base main \ + --head "$BRANCH" \ + --title "docs: update embedding benchmarks" \ + --body "Automated embedding benchmark update from workflow run [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})." diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8239ffa83..53ec6a57a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -141,6 +141,47 @@ tests/ - Parser tests use inline code strings parsed directly with tree-sitter - Always run the full suite (`npm test`) before submitting a PR +## Regression Benchmarks + +Two regression benchmark scripts live in `scripts/`. These are **not** unit +tests — they measure performance metrics that reviewers use to judge whether a +change is acceptable. If your PR touches code covered by a benchmark, you +**must** run it before and after your changes and include the results in the PR +description. + +| Benchmark | What it measures | When to run | +|-----------|-----------------|-------------| +| `node scripts/benchmark.js` | Build speed (native vs WASM), query latency | Changes to `builder.js`, `parser.js`, `queries.js`, `resolve.js`, `db.js`, or the native engine | +| `node scripts/embedding-benchmark.js` | Search recall (Hit@1/3/5/10) across models | Changes to `embedder.js` or embedding strategies | + +### How to report results + +Both scripts output JSON to stdout (progress goes to stderr). 
Run the relevant +benchmark on `main` (before), then on your branch (after), and paste both in +your PR description: + +```bash +git stash && git checkout main +node scripts/benchmark.js > before.json + +git checkout - && git stash pop +node scripts/benchmark.js > after.json +``` + +In the PR, include a table like: + +``` +## Benchmark results + +| Metric | Before | After | Delta | +|--------------|--------|--------|-------| +| Build (ms) | 1200 | 1180 | -20 | +| Hit@1 | 75.5% | 76.2% | +0.7% | +``` + +Regressions are not automatically blocking, but unexplained drops in speed or +recall will be questioned during review. + ## Common Contribution Types ### Bug Fixes diff --git a/README.md b/README.md index cdc82d778..ddee7f34d 100644 --- a/README.md +++ b/README.md @@ -373,7 +373,7 @@ Codegraph also extracts symbols from common callback patterns: Commander `.comma ## 📊 Performance -Self-measured on every release via CI ([full history](generated/BENCHMARKS.md)): +Self-measured on every release via CI ([build benchmarks](generated/BUILD-BENCHMARKS.md) | [embedding benchmarks](generated/EMBEDDING-BENCHMARKS.md)): | Metric | Latest | |---|---| diff --git a/generated/BENCHMARKS.md b/generated/BUILD-BENCHMARKS.md similarity index 100% rename from generated/BENCHMARKS.md rename to generated/BUILD-BENCHMARKS.md diff --git a/scripts/embedding-benchmark.js b/scripts/embedding-benchmark.js new file mode 100644 index 000000000..73fe2d8e0 --- /dev/null +++ b/scripts/embedding-benchmark.js @@ -0,0 +1,145 @@ +#!/usr/bin/env node + +/** + * Embedding benchmark runner — measures search recall across all models. + * + * For every function/method/class in the graph, generates a query from the + * symbol name (splitIdentifier) and checks if search finds that symbol. + * Tests all available embedding models, outputs JSON to stdout. + * + * Skips jina-code when HF_TOKEN is not set (gated model). 
+ *
+ * Usage: node scripts/embedding-benchmark.js > result.json
+ */
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { performance } from 'node:perf_hooks';
+import { fileURLToPath } from 'node:url';
+import Database from 'better-sqlite3';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const root = path.resolve(__dirname, '..');
+
+// package.json is read only for its version, which is stamped into the output.
+const pkg = JSON.parse(fs.readFileSync(path.join(root, 'package.json'), 'utf8'));
+// The graph database is opened from here but never built by this script, so it
+// has to exist before the benchmark is started.
+const dbPath = path.join(root, '.codegraph', 'graph.db');
+
+const { buildEmbeddings, MODELS, searchData } = await import(
+  new URL('../src/embedder.js', import.meta.url).href
+);
+
+// Redirect console.log to stderr so only JSON goes to stdout
+const origLog = console.log;
+console.log = (...args) => console.error(...args);
+
+// File paths matching this pattern are excluded from the benchmark symbol set.
+const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./;
+
+/**
+ * Turn an identifier into a space-separated phrase used as the search query.
+ *
+ * Inserts a space at each lower→upper boundary (`buildGraph` → `build Graph`)
+ * and before the last capital of an uppercase run that is followed by a
+ * lowercase letter (`parseHTMLDoc` → `parse HTML Doc`), turns runs of `_`/`-`
+ * into a single space, and trims. Letter case is left unchanged.
+ *
+ * @param {string} name - Symbol name.
+ * @returns {string} The name split into words.
+ */
+function splitIdentifier(name) {
+  return name
+    .replace(/([a-z])([A-Z])/g, '$1 $2')
+    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
+    .replace(/[_-]+/g, ' ')
+    .trim();
+}
+
+/**
+ * Load the benchmark's symbol set from the graph database.
+ *
+ * Reads every `function`, `method` and `class` node (ordered by file, line),
+ * drops rows whose file matches TEST_PATTERN, keeps only the first row for
+ * each distinct name, and skips names whose generated query is shorter than
+ * 4 characters.
+ *
+ * @returns {{name: string, kind: string, file: string, query: string}[]}
+ *   One entry per benchmarked symbol, with the query derived from its name.
+ */
+function loadSymbols() {
+  const db = new Database(dbPath, { readonly: true });
+  let rows = db
+    .prepare(
+      `SELECT name, kind, file FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`,
+    )
+    .all();
+  db.close();
+
+  rows = rows.filter((r) => !TEST_PATTERN.test(r.file));
+
+  // Hits are matched by name only, so a name is benchmarked once even when it
+  // is defined in several files.
+  const seen = new Set();
+  const symbols = [];
+  for (const row of rows) {
+    if (seen.has(row.name)) continue;
+    seen.add(row.name);
+    const query = splitIdentifier(row.name);
+    if (query.length < 4) continue;
+    symbols.push({ name: row.name, kind: row.kind, file: row.file, query });
+  }
+  return symbols;
+}
+
+/**
+ * Benchmark one embedding model against the symbol set.
+ *
+ * Calls buildEmbeddings() with the `structured` strategy, then issues one
+ * searchData() call per symbol (limit 10, minScore 0.01) and records the rank
+ * at which the symbol's own name comes back.
+ *
+ * NOTE(review): presumably buildEmbeddings() replaces the embeddings stored in
+ * graph.db for the given model, which would be why models are benchmarked one
+ * at a time — confirm in src/embedder.js.
+ *
+ * @param {string} modelKey - Key into MODELS.
+ * @param {{name: string, query: string}[]} symbols - Symbols from loadSymbols().
+ * @returns {Promise<object>} `dim` and `contextWindow` copied from MODELS,
+ *   cumulative hit counters (`hits1`, `hits3`, `hits5`, `hits10`), `misses`
+ *   (`total - hits10`), `total`, and `embedTimeMs` / `searchTimeMs` rounded to
+ *   whole milliseconds.
+ */
+async function benchmarkModel(modelKey, symbols) {
+  const embedStart = performance.now();
+  await buildEmbeddings(root, modelKey, dbPath, { strategy: 'structured' });
+  const embedTimeMs = Math.round(performance.now() - embedStart);
+
+  let hits1 = 0;
+  let hits3 = 0;
+  let hits5 = 0;
+  let hits10 = 0;
+
+  // searchTimeMs covers the whole query loop, not a single search.
+  const searchStart = performance.now();
+  for (const { name, query } of symbols) {
+    const data = await searchData(query, dbPath, { minScore: 0.01, limit: 10 });
+    // No result object: nothing is counted, so the symbol ends up in `misses`.
+    if (!data) continue;
+
+    const names = data.results.map((r) => r.name);
+    // 1-based rank of the expected name; 0 when it is not in the results.
+    const rank = names.indexOf(name) + 1;
+    if (rank === 1) hits1++;
+    if (rank >= 1 && rank <= 3) hits3++;
+    if (rank >= 1 && rank <= 5) hits5++;
+    if (rank >= 1 && rank <= 10) hits10++;
+  }
+  const searchTimeMs = Math.round(performance.now() - searchStart);
+
+  const total = symbols.length;
+  return {
+    dim: MODELS[modelKey].dim,
+    contextWindow: MODELS[modelKey].contextWindow,
+    hits1,
+    hits3,
+    hits5,
+    hits10,
+    misses: total - hits10,
+    total,
+    embedTimeMs,
+    searchTimeMs,
+  };
+}
+
+// ── Run benchmarks ──────────────────────────────────────────────────────
+
+const symbols = loadSymbols();
+console.error(`Loaded ${symbols.length} symbols for benchmark`);
+
+const hasHfToken = !!process.env.HF_TOKEN;
+const modelKeys = Object.keys(MODELS);
+// Keyed by model; a model that is skipped or fails has no entry here.
+const results = {};
+
+// Models are benchmarked one after another, never concurrently.
+for (const key of modelKeys) {
+  if (key === 'jina-code' && !hasHfToken) {
+    console.error(`Skipping ${key} (HF_TOKEN not set)`);
+    continue;
+  }
+
+  console.error(`\nBenchmarking model: ${key}...`);
+  try {
+    results[key] = await benchmarkModel(key, symbols);
+    const r = results[key];
+    console.error(
+      ` Hit@1=${r.hits1}/${r.total} Hit@3=${r.hits3}/${r.total} Hit@5=${r.hits5}/${r.total} misses=${r.misses}`,
+    );
+  } catch (err) {
+    // Best effort: a failing model is reported on stderr and left out of the
+    // results; the remaining models still run.
+    console.error(` FAILED: ${err.message}`);
+  }
+}
+
+// Restore console.log for JSON output
+console.log = origLog;
+
+const output = {
+  version: pkg.version,
+  // YYYY-MM-DD taken from the ISO (UTC) timestamp.
+  date: new Date().toISOString().slice(0, 10),
+  strategy: 'structured',
+  symbols: symbols.length,
+  models: results,
+};
+
+console.log(JSON.stringify(output, null, 2));
diff --git a/scripts/update-benchmark-report.js b/scripts/update-benchmark-report.js
index 3a18393ae..0f2bb1b59 100644
--- a/scripts/update-benchmark-report.js
+++ b/scripts/update-benchmark-report.js
@@ -2,7 +2,7 @@
 
 /**
  * Update
benchmark report — reads benchmark JSON and updates: - * 1. generated/BENCHMARKS.md (historical table + raw JSON in HTML comment) + * 1. generated/BUILD-BENCHMARKS.md (historical table + raw JSON in HTML comment) * 2. README.md (performance section with latest numbers) * * Usage: @@ -28,10 +28,10 @@ if (arg) { const entry = JSON.parse(jsonText); // ── Paths ──────────────────────────────────────────────────────────────── -const benchmarkPath = path.join(root, 'generated', 'BENCHMARKS.md'); +const benchmarkPath = path.join(root, 'generated', 'BUILD-BENCHMARKS.md'); const readmePath = path.join(root, 'README.md'); -// ── Load existing history from BENCHMARKS.md ───────────────────────────── +// ── Load existing history from BUILD-BENCHMARKS.md ───────────────────────────── let history = []; if (fs.existsSync(benchmarkPath)) { const content = fs.readFileSync(benchmarkPath, 'utf8'); @@ -96,7 +96,7 @@ function engineRow(h, prev, engineKey) { ); } -// ── Build BENCHMARKS.md ────────────────────────────────────────────────── +// ── Build BUILD-BENCHMARKS.md ────────────────────────────────────────────────── let md = '# Codegraph Performance Benchmarks\n\n'; md += 'Self-measured on every release by running codegraph on its own codebase.\n'; md += 'Metrics are normalized per file for cross-version comparability.\n\n'; @@ -177,7 +177,7 @@ if (fs.existsSync(readmePath)) { const perfSection = `## 📊 Performance -Self-measured on every release via CI ([full history](generated/BENCHMARKS.md)): +Self-measured on every release via CI ([build benchmarks](generated/BUILD-BENCHMARKS.md) | [embedding benchmarks](generated/EMBEDDING-BENCHMARKS.md)): | Metric | Latest | |---|---| diff --git a/scripts/update-embedding-report.js b/scripts/update-embedding-report.js new file mode 100644 index 000000000..d866eb8e9 --- /dev/null +++ b/scripts/update-embedding-report.js @@ -0,0 +1,134 @@ +#!/usr/bin/env node + +/** + * Update embedding benchmark report — reads benchmark JSON and updates: + * 
generated/EMBEDDING-BENCHMARKS.md (historical table + raw JSON in HTML comment)
+ *
+ * The full run history is embedded as JSON in an
+ * `<!-- EMBEDDING_BENCHMARK_DATA ... -->` comment at the end of the report and
+ * read back on the next run; both tables are regenerated from it every time.
+ *
+ * Usage:
+ *   node scripts/update-embedding-report.js embedding-benchmark-result.json
+ *   node scripts/embedding-benchmark.js | node scripts/update-embedding-report.js
+ */
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const root = path.resolve(__dirname, '..');
+
+// ── Read benchmark JSON from file arg or stdin ───────────────────────────
+let jsonText;
+const arg = process.argv[2];
+if (arg) {
+  jsonText = fs.readFileSync(path.resolve(arg), 'utf8');
+} else {
+  jsonText = fs.readFileSync('/dev/stdin', 'utf8');
+}
+const entry = JSON.parse(jsonText);
+
+// ── Paths ────────────────────────────────────────────────────────────────
+const reportPath = path.join(root, 'generated', 'EMBEDDING-BENCHMARKS.md');
+
+// ── Load existing history ────────────────────────────────────────────────
+// Matches the data block this script appends to the report; group 1 is the JSON.
+const HISTORY_BLOCK = /<!--\s*EMBEDDING_BENCHMARK_DATA\s*([\s\S]*?)\s*-->/;
+
+let history = [];
+if (fs.existsSync(reportPath)) {
+  const content = fs.readFileSync(reportPath, 'utf8');
+  const match = content.match(HISTORY_BLOCK);
+  if (match) {
+    try {
+      const parsed = JSON.parse(match[1]);
+      // Anything but an array cannot be a history list; keep the empty default.
+      if (Array.isArray(parsed)) history = parsed;
+    } catch (err) {
+      // Start fresh if the embedded JSON is corrupt, but say so on stderr.
+      console.error(`Warning: ignoring unreadable history in ${reportPath}: ${err.message}`);
+    }
+  }
+}
+
+// Add new entry (deduplicate by version): a re-run for an already recorded
+// version replaces that entry in place, a new version goes to the front.
+const idx = history.findIndex((h) => h.version === entry.version);
+if (idx >= 0) {
+  history[idx] = entry;
+} else {
+  history.unshift(entry);
+}
+
+// ── Helpers ──────────────────────────────────────────────────────────────
+/**
+ * Format `n` out of `total` as a percentage with one decimal, e.g. `75.5%`.
+ * @param {number} n
+ * @param {number} total
+ * @returns {string}
+ */
+function pct(n, total) {
+  return `${pctVal(n, total).toFixed(1)}%`;
+}
+
+/**
+ * Describe the change between two percentages in percentage points.
+ * @param {number} current - Percentage for this entry.
+ * @param {number|null} previous - Percentage for the next-older entry, if any.
+ * @returns {string} `''` without a previous value, `' ~'` for changes below
+ *   0.5pp, otherwise an up/down arrow followed by the absolute difference.
+ */
+function trend(current, previous) {
+  if (previous == null) return '';
+  const diff = current - previous;
+  if (Math.abs(diff) < 0.5) return ' ~';
+  return diff > 0 ? ` ↑${diff.toFixed(1)}pp` : ` ↓${Math.abs(diff).toFixed(1)}pp`;
+}
+
+/**
+ * Percentage (0–100) of `n` out of `total`.
+ * Returns 0 when `total` is not a positive number, so a run without symbols
+ * renders as `0.0%` instead of `NaN%`.
+ * @param {number} n
+ * @param {number} total
+ * @returns {number}
+ */
+function pctVal(n, total) {
+  return total > 0 ? (n / total) * 100 : 0;
+}
+
+/**
+ * Format a duration: seconds with one decimal from 1000ms up, whole
+ * milliseconds below that.
+ * @param {number} ms
+ * @returns {string}
+ */
+function formatMs(ms) {
+  if (ms >= 1000) return `${(ms / 1000).toFixed(1)}s`;
+  return `${Math.round(ms)}ms`;
+}
+
+// ── Build EMBEDDING-BENCHMARKS.md ────────────────────────────────────────
+let md = '# Codegraph Embedding Benchmarks\n\n';
+md += 'Self-measured on every release using auto-generated queries from symbol names.\n';
+md += 'Each symbol\'s name is split into words (e.g. `buildGraph` → `"build Graph"`) and used as the search query.\n';
+md += 'Hit@N = expected symbol found in top N results.\n\n';
+
+md +=
+  '| Version | Model | Symbols | Hit@1 | Hit@3 | Hit@5 | Misses | Embed Time |\n';
+md +=
+  '|---------|-------|--------:|------:|------:|------:|-------:|-----------:|\n';
+
+// History is newest-first, so the entry after the current one is the previous
+// version and supplies the baseline for the trend markers.
+for (let i = 0; i < history.length; i++) {
+  const h = history[i];
+  const prev = history[i + 1] || null;
+
+  for (const [modelKey, m] of Object.entries(h.models ?? {})) {
+    // Same model in the previous version, if it was benchmarked there.
+    const pm = prev?.models?.[modelKey] || null;
+
+    const h1 = pctVal(m.hits1, m.total);
+    const h3 = pctVal(m.hits3, m.total);
+    const h5 = pctVal(m.hits5, m.total);
+    const ph1 = pm ? pctVal(pm.hits1, pm.total) : null;
+    const ph3 = pm ? pctVal(pm.hits3, pm.total) : null;
+    const ph5 = pm ? pctVal(pm.hits5, pm.total) : null;
+
+    md += `| ${h.version} | ${modelKey} | ${m.total} `;
+    md += `| ${pct(m.hits1, m.total)}${trend(h1, ph1)} `;
+    md += `| ${pct(m.hits3, m.total)}${trend(h3, ph3)} `;
+    md += `| ${pct(m.hits5, m.total)}${trend(h5, ph5)} `;
+    md += `| ${m.misses} `;
+    md += `| ${formatMs(m.embedTimeMs)} |\n`;
+  }
+}
+
+// ── Latest summary ───────────────────────────────────────────────────────
+// history always holds at least the entry that was just added.
+const latest = history[0];
+md += '\n### Latest results\n\n';
+md += `**Version:** ${latest.version} | **Strategy:** ${latest.strategy} | **Symbols:** ${latest.symbols} | **Date:** ${latest.date}\n\n`;
+
+md += '| Model | Dim | Context | Hit@1 | Hit@3 | Hit@5 | Hit@10 | Misses | Embed | Search |\n';
+md += '|-------|----:|--------:|------:|------:|------:|-------:|-------:|------:|-------:|\n';
+
+for (const [modelKey, m] of Object.entries(latest.models ?? {})) {
+  md += `| ${modelKey} `;
+  md += `| ${m.dim} `;
+  md += `| ${m.contextWindow} `;
+  md += `| ${pct(m.hits1, m.total)} `;
+  md += `| ${pct(m.hits3, m.total)} `;
+  md += `| ${pct(m.hits5, m.total)} `;
+  md += `| ${pct(m.hits10, m.total)} `;
+  md += `| ${m.misses} `;
+  md += `| ${formatMs(m.embedTimeMs)} `;
+  md += `| ${formatMs(m.searchTimeMs)} |\n`;
+}
+
+// Persist the raw history so the next run can rebuild the tables from it.
+md += `\n<!-- EMBEDDING_BENCHMARK_DATA\n${JSON.stringify(history, null, 2)}\n-->\n`;
+
+fs.mkdirSync(path.dirname(reportPath), { recursive: true });
+fs.writeFileSync(reportPath, md);
+console.error(`Updated ${path.relative(root, reportPath)}`);
diff --git a/tests/search/embedding-benchmark.js b/tests/search/embedding-benchmark.js
deleted file mode 100644
index 11dc9aad0..000000000
--- a/tests/search/embedding-benchmark.js
+++ /dev/null
@@ -1,124 +0,0 @@
-#!/usr/bin/env node
-
-/**
- * Embedding strategy benchmark — compares structured vs source strategies
- * against real search queries on the current project's graph.
- * - * Prerequisites: - * - @huggingface/transformers installed - * - codegraph build already run (graph.db exists) - * - * Usage: - * node tests/search/embedding-benchmark.js - * node tests/search/embedding-benchmark.js --model minilm - */ - -import path from 'node:path'; -import { buildEmbeddings, DEFAULT_MODEL, MODELS, searchData } from '../../src/embedder.js'; - -const model = process.argv.includes('--model') - ? process.argv[process.argv.indexOf('--model') + 1] - : DEFAULT_MODEL; - -const rootDir = '.'; -const dbPath = path.resolve('.codegraph/graph.db'); - -// Queries with expected best-match symbol name -const QUERIES = [ - { q: 'parse source code with tree-sitter', expect: 'parseFilesAuto' }, - { q: 'find circular dependencies', expect: 'findCycles' }, - { q: 'build dependency graph from source files', expect: 'buildGraph' }, - { q: 'resolve import path to actual file', expect: 'resolveImportPath' }, - { q: 'cosine similarity between vectors', expect: 'cosineSim' }, - { q: 'export graph as DOT format', expect: 'exportDOT' }, - { q: 'semantic search with embeddings', expect: 'search' }, - { q: 'incremental file hashing', expect: 'hashFile' }, - { q: 'load configuration from file', expect: 'loadConfig' }, - { q: 'extract functions and classes from code', expect: 'extractJavaScript' }, - { q: 'impact analysis of code changes', expect: 'diffImpactData' }, - { q: 'start MCP server for AI agents', expect: 'startMCPServer' }, - { q: 'watch files for changes', expect: 'watchProject' }, - { q: 'reciprocal rank fusion for multi-query search', expect: 'multiSearchData' }, -]; - -async function benchmark(strategy) { - await buildEmbeddings(rootDir, model, dbPath, { strategy }); - - let hits1 = 0; - let hits3 = 0; - let hits5 = 0; - const details = []; - - for (const { q, expect: expected } of QUERIES) { - const data = await searchData(q, dbPath, { minScore: 0.01, limit: 10 }); - if (!data) continue; - - const names = data.results.map((r) => r.name); - const rank = 
names.indexOf(expected) + 1; // 0 = not found - if (rank === 1) hits1++; - if (rank >= 1 && rank <= 3) hits3++; - if (rank >= 1 && rank <= 5) hits5++; - - const matchScore = rank > 0 ? data.results[rank - 1].similarity.toFixed(3) : 'miss'; - details.push({ - q: q.slice(0, 50), - expected, - rank: rank || '>10', - actual: names[0], - matchScore, - }); - } - - return { strategy, hits1, hits3, hits5, total: QUERIES.length, details }; -} - -const modelConfig = MODELS[model]; -console.log('=== Embedding Strategy Benchmark ==='); -console.log(`Model: ${model} (${modelConfig.dim}d, ${modelConfig.contextWindow} token context)`); -console.log(`Queries: ${QUERIES.length}`); -console.log(''); - -const structured = await benchmark('structured'); -const source = await benchmark('source'); - -// Summary table -console.log(''); -console.log('=== RESULTS ==='); -console.log(''); -console.log(`${'Metric'.padEnd(12)}${'structured'.padEnd(16)}${'source'.padEnd(16)}delta`); -for (const [label, key] of [ - ['Hit@1', 'hits1'], - ['Hit@3', 'hits3'], - ['Hit@5', 'hits5'], -]) { - const s = structured[key]; - const o = source[key]; - const sp = `${s}/${structured.total} (${((s / structured.total) * 100).toFixed(0)}%)`; - const op = `${o}/${source.total} (${((o / source.total) * 100).toFixed(0)}%)`; - const delta = s - o; - const sign = delta > 0 ? '+' : ''; - console.log(`${label.padEnd(12)}${sp.padEnd(16)}${op.padEnd(16)}${sign}${delta}`); -} - -// Per-query comparison -console.log(''); -console.log(`${'Query'.padEnd(52)}${'Expected'.padEnd(22)}Struct Source`); -for (let i = 0; i < QUERIES.length; i++) { - const s = structured.details[i]; - const o = source.details[i]; - const sw = - typeof s.rank === 'number' && (typeof o.rank !== 'number' || s.rank < o.rank) ? '*' : ' '; - const ow = - typeof o.rank === 'number' && (typeof s.rank !== 'number' || o.rank < s.rank) ? 
'*' : ' '; - console.log( - s.q.padEnd(52) + - s.expected.padEnd(22) + - String(s.rank).padEnd(4) + - sw + - ' ' + - String(o.rank).padEnd(4) + - ow, - ); -} -console.log(''); -console.log('* = better rank for that query');