# Record codebase-memory partial readiness blockers (#4)
---
# Smoke-tests the codebase-memory-mcp lane against the first N frozen
# ContextBench tasks and scores predictions with the official evaluator.
name: ContextBench CBM Readiness Retry

# NOTE: `on` is a YAML 1.1 truthy key; GitHub's workflow parser handles it
# as the trigger map, so no quoting is needed here.
on:
  push:
    branches: [master]
    paths:
      - .github/workflows/contextbench-cbm-readiness-retry.yml
  workflow_dispatch:
    inputs:
      max_tasks:
        description: 'Number of first tasks to run for codebase-memory readiness'
        required: true
        default: '3'
      codebase_memory_version:
        description: 'codebase-memory-mcp release tag'
        required: true
        default: 'v0.6.1'
# Least-privilege token: the job only needs to read repository contents.
permissions:
  contents: read
jobs:
  codebase-memory-first3-readiness:
    runs-on: ubuntu-latest
    # Generous ceiling: per-task indexing alone is allowed up to 45 minutes.
    timeout-minutes: 360
    env:
      ROOT: /tmp/contextbench-cbm-readiness
      TASK_PAYLOADS: /tmp/contextbench-cbm-readiness/task-payloads.json
      CHECKOUT_ROOT: /tmp/contextbench-checkouts
      # workflow_dispatch inputs are empty on push triggers; fall back to
      # the same defaults declared for the dispatch inputs.
      CBM_VERSION: ${{ github.event.inputs.codebase_memory_version || 'v0.6.1' }}
      MAX_TASKS: ${{ github.event.inputs.max_tasks || '3' }}
    steps:
      - uses: actions/checkout@v4
      - uses: pnpm/action-setup@v2
        with:
          version: 10
      - uses: actions/setup-node@v4
        with:
          node-version: '24'
          cache: 'pnpm'
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          pnpm install --frozen-lockfile
          python -m pip install "tree-sitter==0.20.4" "tree-sitter-languages==1.10.2" datasets pyarrow
      - name: Validate fixtures and materialize first tasks
        run: |
          mkdir -p "$ROOT" "$CHECKOUT_ROOT"
          node scripts/contextbench-runner.mjs --validate-fixtures
          node scripts/contextbench-select-slice.mjs --write-task-payloads --out "$TASK_PAYLOADS" --checkout-root "$CHECKOUT_ROOT"
          node scripts/contextbench-select-slice.mjs --materialize-checkouts --payloads "$TASK_PAYLOADS" --max-tasks "$MAX_TASKS"
      - name: Download codebase-memory-mcp
        run: |
          set -euxo pipefail
          mkdir -p "$ROOT/tool"
          curl -fsSL "https://github.com/DeusData/codebase-memory-mcp/releases/download/${CBM_VERSION}/codebase-memory-mcp-linux-amd64.tar.gz" -o "$ROOT/tool/cbm.tar.gz"
          tar -xzf "$ROOT/tool/cbm.tar.gz" -C "$ROOT/tool"
          chmod +x "$ROOT/tool/codebase-memory-mcp" || true
          "$ROOT/tool/codebase-memory-mcp" --version || true
      - name: Run readiness gate with official evaluator
        env:
          CBM_BIN: /tmp/contextbench-cbm-readiness/tool/codebase-memory-mcp
        run: |
          # The quoted heredoc ('NODE') prevents shell expansion inside the
          # script; YAML strips this block's common indent, so the terminator
          # reaches the shell at column 0 as required.
          cat > "$ROOT/readiness.mjs" <<'NODE'
          import { spawnSync } from 'node:child_process';
          import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
          import { basename, join } from 'node:path';
          const root = process.env.ROOT;
          const payloads = JSON.parse(readFileSync(process.env.TASK_PAYLOADS, 'utf8'));
          const tasks = payloads.tasks.slice(0, Number(process.env.MAX_TASKS || '3'));
          const cbm = process.env.CBM_BIN;
          const outRoot = join(root, 'out');
          mkdirSync(outRoot, { recursive: true });
          // Run a command synchronously; capture status, timing and output.
          function run(cmd, args, opts = {}) {
            const started = Date.now();
            const r = spawnSync(cmd, args, {
              cwd: opts.cwd || process.cwd(),
              env: opts.env || process.env,
              encoding: 'utf8',
              timeout: opts.timeoutMs || 20 * 60 * 1000,
              maxBuffer: 64 * 1024 * 1024
            });
            return { command: [cmd, ...args].join(' '), cwd: opts.cwd || process.cwd(), status: r.status, signal: r.signal, error: r.error?.message || null, durationMs: Date.now() - started, stdout: r.stdout || '', stderr: r.stderr || '' };
          }
          // Try candidate arg lists in order; return the first exit-0 attempt.
          function firstOk(label, candidates, opts) {
            const attempts = [];
            for (const args of candidates) {
              const attempt = run(cbm, args, opts);
              attempts.push(attempt);
              if (attempt.status === 0) return { ...attempt, label, attempts, selectedBecause: 'status0' };
            }
            return { ...(attempts.at(-1) || {}), label, attempts, selectedBecause: 'no_status0' };
          }
          // Prefer an exit-0 attempt that also yields file locations.
          function firstUseful(label, candidates, opts) {
            const attempts = [];
            let firstSuccess = null;
            for (const args of candidates) {
              const attempt = run(cbm, args, opts);
              attempts.push(attempt);
              if (attempt.status === 0 && !firstSuccess) firstSuccess = attempt;
              if (attempt.status === 0 && hasUsefulLocations(attempt)) return { ...attempt, label, attempts, selectedBecause: 'status0_with_locations' };
            }
            if (firstSuccess) return { ...firstSuccess, label, attempts, selectedBecause: 'status0_without_locations' };
            return { ...(attempts.at(-1) || {}), label, attempts, selectedBecause: 'no_status0' };
          }
          // Distill a search query from free text: strip punctuation, keep up
          // to 8 words of length >= 4.
          function queryOf(text) {
            return String(text || '').replace(/[`*_#>\[\](){},.;:!?/\\]/g, ' ').split(/\s+/).filter((w) => w.length >= 4).slice(0, 8).join(' ');
          }
          // Best-effort JSON extraction: whole string first, then the widest
          // {...} or [...] slice.
          function jsonish(s) {
            const t = String(s || '').trim();
            if (!t) return null;
            try { return JSON.parse(t); } catch {}
            for (const [a, b] of [['{', '}'], ['[', ']']]) {
              const i = t.indexOf(a), j = t.lastIndexOf(b);
              if (i >= 0 && j > i) { try { return JSON.parse(t.slice(i, j + 1)); } catch {} }
            }
            return null;
          }
          // Collect parsed JSON payloads from stdout/stderr, including MCP-style
          // nested content[].text payloads.
          function payloadsFrom(result) {
            const out = [];
            for (const text of [result?.stdout, result?.stderr]) {
              const parsed = jsonish(text);
              if (!parsed) continue;
              out.push(parsed);
              const content = parsed.content;
              if (Array.isArray(content)) {
                for (const item of content) {
                  const nested = jsonish(item?.text);
                  if (nested) out.push(nested);
                }
              }
            }
            return out;
          }
          // Extract a project identifier from any of the command results.
          function projectFrom(...results) {
            for (const result of results) {
              for (const obj of payloadsFrom(result)) {
                if (typeof obj.project === 'string' && obj.project) return obj.project;
                if (Array.isArray(obj.projects)) {
                  for (const entry of obj.projects) {
                    if (typeof entry === 'string' && entry) return entry;
                    if (typeof entry?.project === 'string' && entry.project) return entry.project;
                    if (typeof entry?.name === 'string' && entry.name) return entry.name;
                  }
                }
              }
            }
            return '';
          }
          // Record a file span, normalizing leading slashes and line bounds.
          function add(spans, file, start = 1, end = start) {
            if (typeof file !== 'string' || !file) return;
            const clean = file.replace(/^\/+/, '');
            const s = Math.max(1, Number(start) || 1);
            const e = Math.max(s, Number(end) || s);
            const list = spans.get(clean) || [];
            list.push({ start: s, end: e });
            spans.set(clean, list);
          }
          // Recursively mine any object graph for file/line-shaped fields.
          function walk(v, spans) {
            if (!v || typeof v !== 'object') return;
            if (Array.isArray(v)) { for (const x of v) walk(x, spans); return; }
            const file = v.file || v.path || v.file_path || v.relative_path || v.filename || v.source_path;
            const start = v.start_line || v.startLine || v.line || v.line_number || v.start || 1;
            const end = v.end_line || v.endLine || v.end || start;
            add(spans, file, start, end);
            for (const x of Object.values(v)) walk(x, spans);
          }
          // Scrape "path.ext:123"-style references out of plain text.
          function textPaths(s, spans) {
            const re = /([A-Za-z0-9_.\/-]+\.(?:js|jsx|ts|tsx|py|go|rs|java|c|cc|cpp|h|hpp|rb|php|cs|kt|swift|vue|svelte|json|yml|yaml|md))(?::|#L|\s+line\s+)?(\d+)?/g;
            let m;
            while ((m = re.exec(String(s || ''))) !== null) add(spans, m[1], m[2] || 1, m[2] || 1);
          }
          function addLocationsFrom(result, spans) {
            for (const text of [result?.stdout, result?.stderr]) {
              const parsed = jsonish(text);
              if (parsed) walk(parsed, spans);
              textPaths(text, spans);
            }
          }
          function hasUsefulLocations(result) {
            for (const obj of payloadsFrom(result)) {
              if (Array.isArray(obj.results) && obj.results.length > 0) return true;
              if (Array.isArray(obj.raw_matches) && obj.raw_matches.length > 0) return true;
              if (Number(obj.total_results || 0) > 0 || Number(obj.raw_match_count || 0) > 0 || Number(obj.total_grep_matches || 0) > 0) return true;
            }
            const spans = new Map();
            addLocationsFrom(result, spans);
            return spans.size > 0;
          }
          function regexLiteral(value) {
            return String(value || '').replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
          }
          function allAttempts(...results) {
            return results.flatMap((result) => Array.isArray(result?.attempts) && result.attempts.length ? result.attempts : [result]);
          }
          function rowReady(report) {
            return report.setupStatus === 0 && report.indexStatus === 0 && report.toolCallable && report.nonEmptyPrediction && report.officialEvaluatorScoreable;
          }
          function shortFailure(report) {
            return {
              taskId: report.taskId,
              repo: report.repo,
              setupStatus: report.setupStatus,
              indexStatus: report.indexStatus,
              indexSignal: report.indexSignal,
              indexError: report.indexError,
              toolCallable: report.toolCallable,
              nonEmptyPrediction: report.nonEmptyPrediction,
              officialEvaluatorScoreable: report.officialEvaluatorScoreable,
              indexStderrExcerpt: report.indexStderrExcerpt
            };
          }
          const reports = [];
          for (const [i, task] of tasks.entries()) {
            const dir = join(outRoot, `${i + 1}-${task.instance_id}`);
            mkdirSync(dir, { recursive: true });
            const env = { ...process.env, CBM_CACHE_DIR: join(dir, 'cbm-cache'), CBM_DIAGNOSTICS: '1' };
            const opts = { cwd: task.repo_checkout_path, env, timeoutMs: 120_000 };
            const query = queryOf(task.problem_statement);
            const firstTerm = query.split(/\s+/)[0] || 'import';
            const setup = run(cbm, ['--version'], { env, timeoutMs: 60_000 });
            const indexRun = run(cbm, ['cli', 'index_repository', JSON.stringify({ repo_path: task.repo_checkout_path })], { ...opts, timeoutMs: 45 * 60 * 1000 });
            const listProjects = firstOk('list_projects', [['cli', 'list_projects'], ['cli', 'list_projects', '{}']], opts);
            const project = projectFrom(indexRun, listProjects) || basename(task.repo_checkout_path);
            const graphSchema = firstOk('get_graph_schema', [['cli', 'get_graph_schema', JSON.stringify({ project })]], opts);
            const graphSearch = firstUseful('search_graph', [
              ['cli', 'search_graph', JSON.stringify({ project, query, limit: 25 })],
              ['cli', 'search_graph', JSON.stringify({ project, label: 'Function', query: firstTerm, limit: 25 })],
              ['cli', 'search_graph', JSON.stringify({ project, label: 'Function', name_pattern: `.*${regexLiteral(firstTerm)}.*`, limit: 25 })],
              ['cli', 'search_graph', JSON.stringify({ project, label: 'Function', limit: 25 })],
              ['cli', 'search_graph', JSON.stringify({ project, label: 'Class', limit: 25 })]
            ], opts);
            const codeSearch = firstUseful('search_code', [
              ['cli', 'search_code', JSON.stringify({ project, pattern: query, mode: 'compact', limit: 25 })],
              ['cli', 'search_code', JSON.stringify({ project, pattern: firstTerm, mode: 'compact', limit: 25 })],
              ['cli', 'search_code', JSON.stringify({ project, pattern: '.', mode: 'compact', limit: 25 })]
            ], opts);
            const spans = new Map();
            for (const r of allAttempts(listProjects, graphSchema, graphSearch, codeSearch)) addLocationsFrom(r, spans);
            const predFiles = [...spans.keys()].slice(0, 20);
            const predSpans = Object.fromEntries([...spans.entries()].slice(0, 20));
            const predictionPath = join(dir, 'prediction.json');
            writeFileSync(predictionPath, JSON.stringify({ instance_id: task.instance_id, repo_url: task.repo_checkout_path, commit: task.base_commit, traj_data: { pred_steps: [{ files: predFiles, spans: predSpans }], pred_files: predFiles, pred_spans: predSpans }, model_patch: '' }, null, 2));
            for (const [name, result] of Object.entries({ setup, indexRun, listProjects, graphSchema, graphSearch, codeSearch })) writeFileSync(join(dir, `${name}.json`), JSON.stringify(result, null, 2));
            const goldPath = join(dir, 'gold.json');
            const gold = run('node', ['scripts/contextbench-select-slice.mjs', '--write-gold', '--task-id', task.instance_id, '--out', goldPath, '--payloads', process.env.TASK_PAYLOADS], { timeoutMs: 10 * 60 * 1000 });
            const official = join(root, 'ContextBench-official');
            if (!existsSync(join(official, 'contextbench', 'evaluate.py'))) run('git', ['clone', '--depth', '1', 'https://github.com/EuniAI/ContextBench.git', official], { timeoutMs: 10 * 60 * 1000 });
            const scorePath = join(dir, 'official-score.jsonl');
            const evaluator = run('python', ['-m', 'contextbench.evaluate', '--gold', goldPath, '--pred', predictionPath, '--cache', join(dir, 'repo-cache'), '--out', scorePath], { cwd: official, timeoutMs: 20 * 60 * 1000 });
            const scoreText = existsSync(scorePath) ? readFileSync(scorePath, 'utf8').trim() : '';
            const report = { taskId: task.instance_id, repo: task.repo, project, setupStatus: setup.status, indexStatus: indexRun.status, indexSignal: indexRun.signal, indexError: indexRun.error, indexStderrExcerpt: indexRun.stderr.slice(0, 1000), toolCallable: [graphSearch, codeSearch].some((r) => r.status === 0), nonEmptyPrediction: predFiles.length > 0 && Object.keys(predSpans).length > 0, officialEvaluatorStatus: evaluator.status, officialEvaluatorScoreable: evaluator.status === 0 && scoreText.length > 0, costs: { setupDurationMs: setup.durationMs, indexDurationMs: indexRun.durationMs, queryDurationMs: listProjects.durationMs + graphSchema.durationMs + graphSearch.durationMs + codeSearch.durationMs, evaluatorDurationMs: evaluator.durationMs }, laneIsolation: { allowedTool: 'codebase-memory-mcp', observedCommands: [setup.command, indexRun.command, listProjects.command, graphSchema.command, graphSearch.command, codeSearch.command], observedCwds: [setup.cwd, indexRun.cwd, listProjects.cwd, graphSchema.cwd, graphSearch.cwd, codeSearch.cwd], disallowedNativeReadSearchUsedForPrediction: false }, query, predFiles, selectedBecause: { graphSearch: graphSearch.selectedBecause, codeSearch: codeSearch.selectedBecause }, commands: { setup, indexRun, listProjects, graphSchema, graphSearch, codeSearch, gold, evaluator } };
            writeFileSync(join(dir, 'readiness-report.json'), JSON.stringify(report, null, 2));
            reports.push(report);
          }
          const functionalReports = reports.filter(rowReady);
          const blockers = reports.filter((r) => !rowReady(r)).map(shortFailure);
          const summary = { createdAt: new Date().toISOString(), lane: 'codebase-memory-mcp', ready: functionalReports.length > 0, readinessMeaning: 'lane tool is callable and scoreable on at least one frozen task; per-task setup/index blockers remain separate evidence and are not quality results', attemptedRows: reports.length, functionalRows: functionalReports.length, blockerRows: blockers.length, scoreableRows: reports.filter((r) => r.officialEvaluatorScoreable).length, nonEmptyPredictionRows: reports.filter((r) => r.nonEmptyPrediction).length, setupIndexCostReportedSeparately: true, blockers, reports };
          writeFileSync(join(outRoot, 'lane-readiness-codebase-memory-first3.json'), JSON.stringify(summary, null, 2));
          console.log(JSON.stringify({ ready: summary.ready, readinessMeaning: summary.readinessMeaning, attemptedRows: summary.attemptedRows, functionalRows: summary.functionalRows, blockerRows: summary.blockerRows, scoreableRows: summary.scoreableRows, nonEmptyPredictionRows: summary.nonEmptyPredictionRows, blockers: summary.blockers, reports: summary.reports.map((r) => ({ taskId: r.taskId, setupStatus: r.setupStatus, indexStatus: r.indexStatus, indexSignal: r.indexSignal, indexError: r.indexError, toolCallable: r.toolCallable, nonEmptyPrediction: r.nonEmptyPrediction, officialEvaluatorScoreable: r.officialEvaluatorScoreable, predFiles: r.predFiles.length, selectedBecause: r.selectedBecause, costs: r.costs })) }, null, 2));
          if (!summary.ready) process.exitCode = 1;
          NODE
          node "$ROOT/readiness.mjs"
      - name: Upload readiness artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: contextbench-cbm-readiness-retry
          path: /tmp/contextbench-cbm-readiness
          retention-days: 14