|
# CI workflow that checks "readiness" of the codebase-memory-mcp lane for
# ContextBench: it indexes the first N benchmark tasks with the tool, derives
# file/span predictions solely from the tool's output, and scores them with
# the official evaluator.
name: ContextBench CI Recovery

on:
  # Runs automatically on pushes to the dedicated recovery branch.
  push:
    branches: [bench/contextbench-ci-recovery-20260510]
  # Manual trigger with overridable task count and tool release tag.
  workflow_dispatch:
    inputs:
      max_tasks:
        description: 'Number of first tasks to run for codebase-memory readiness'
        required: true
        default: '3'
      codebase_memory_version:
        description: 'codebase-memory-mcp release tag'
        required: true
        default: 'v0.6.1'

# Read-only token: the job only checks out code and uploads artifacts.
permissions:
  contents: read

jobs:
  codebase-memory-first3-readiness:
    runs-on: ubuntu-latest
    timeout-minutes: 360  # indexing several large checkouts can be slow
    env:
      # Scratch roots live under runner.temp so nothing leaks into the workspace.
      CI_READINESS_ROOT: ${{ runner.temp }}/contextbench-readiness
      TASK_PAYLOADS: ${{ runner.temp }}/contextbench-readiness/task-payloads.json
      CHECKOUT_ROOT: ${{ runner.temp }}/contextbench-checkouts
      # Inputs are absent on push triggers, so fall back to the same defaults.
      CBM_VERSION: ${{ github.event.inputs.codebase_memory_version || 'v0.6.1' }}
      MAX_TASKS: ${{ github.event.inputs.max_tasks || '3' }}
| 30 | + steps: |
    steps:
      # Toolchain: Node + pnpm for the repo scripts, Python for the official
      # ContextBench evaluator invoked later via `python -m contextbench.evaluate`.
      - uses: actions/checkout@v4

      - uses: pnpm/action-setup@v2
        with:
          version: 10

      - uses: actions/setup-node@v4
        with:
          node-version: '24'
          cache: 'pnpm'

      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install repo dependencies
        run: pnpm install --frozen-lockfile

      # Sanity-check the pinned benchmark fixtures before doing any real work.
      - name: Validate frozen ContextBench fixtures
        run: node scripts/contextbench-runner.mjs --validate-fixtures

      # Write the per-task payload JSON, then materialize each task's repository
      # checkout so the tool has real code to index.
      - name: Materialize first task payloads and checkouts
        run: |
          mkdir -p "$CI_READINESS_ROOT" "$CHECKOUT_ROOT"
          node scripts/contextbench-select-slice.mjs --write-task-payloads --out "$TASK_PAYLOADS" --checkout-root "$CHECKOUT_ROOT"
          node scripts/contextbench-select-slice.mjs --materialize-checkouts --payloads "$TASK_PAYLOADS" --max-tasks "$MAX_TASKS"
| 57 | +
|
      # Fetch the pinned codebase-memory-mcp release binary for linux-amd64.
      # chmod and the --version probe are deliberately best-effort (`|| true`):
      # the archive layout may vary across releases (hence the `find` listing),
      # and the readiness script records the real per-task setup status anyway.
      # NOTE(review): assumes the tarball unpacks the binary at the tool root —
      # confirm against the release's actual archive structure.
      - name: Download codebase-memory-mcp
        run: |
          set -euxo pipefail
          mkdir -p "$CI_READINESS_ROOT/tool"
          curl -fsSL "https://github.com/DeusData/codebase-memory-mcp/releases/download/${CBM_VERSION}/codebase-memory-mcp-linux-amd64.tar.gz" -o "$CI_READINESS_ROOT/tool/cbm.tar.gz"
          tar -xzf "$CI_READINESS_ROOT/tool/cbm.tar.gz" -C "$CI_READINESS_ROOT/tool"
          find "$CI_READINESS_ROOT/tool" -maxdepth 3 -type f -name 'codebase-memory-mcp*' -print
          chmod +x "$CI_READINESS_ROOT/tool/codebase-memory-mcp" || true
          "$CI_READINESS_ROOT/tool/codebase-memory-mcp" --version || true
| 67 | +
|
      # Generate and execute the readiness script. The script is written via a
      # quoted heredoc ('NODE'), so the shell expands nothing inside it; all
      # configuration reaches the script through environment variables.
      - name: Run codebase-memory-mcp first3 readiness with official evaluator
        env:
          CBM_BIN: ${{ runner.temp }}/contextbench-readiness/tool/codebase-memory-mcp
        run: |
          cat > "$CI_READINESS_ROOT/codebase-memory-first3-readiness.mjs" <<'NODE'
          import { spawnSync } from 'node:child_process';
          import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
          import { join } from 'node:path';

          // Configuration handed in by the workflow step environment.
          const root = process.env.CI_READINESS_ROOT;
          const payloadPath = process.env.TASK_PAYLOADS;
          const cbmBin = process.env.CBM_BIN;
          const maxTasks = Number(process.env.MAX_TASKS || '3');
          // Task payloads were materialized by the earlier slice-selection step.
          const payloads = JSON.parse(readFileSync(payloadPath, 'utf8'));
          const tasks = payloads.tasks.slice(0, maxTasks);
          const outRoot = join(root, 'codebase-memory-first3');
          mkdirSync(outRoot, { recursive: true });
|
| 86 | + function run(command, args, options = {}) { |
| 87 | + const started = Date.now(); |
| 88 | + const result = spawnSync(command, args, { |
| 89 | + encoding: 'utf8', |
| 90 | + timeout: options.timeoutMs ?? 20 * 60 * 1000, |
| 91 | + env: options.env ?? process.env, |
| 92 | + cwd: options.cwd ?? process.cwd(), |
| 93 | + maxBuffer: 64 * 1024 * 1024 |
| 94 | + }); |
| 95 | + return { |
| 96 | + command: [command, ...args].join(' '), |
| 97 | + status: result.status, |
| 98 | + signal: result.signal, |
| 99 | + error: result.error?.message ?? null, |
| 100 | + durationMs: Date.now() - started, |
| 101 | + stdout: result.stdout ?? '', |
| 102 | + stderr: result.stderr ?? '' |
| 103 | + }; |
| 104 | + } |
| 105 | +
|
| 106 | + function parseJsonish(text) { |
| 107 | + const trimmed = String(text || '').trim(); |
| 108 | + if (!trimmed) return null; |
| 109 | + try { return JSON.parse(trimmed); } catch {} |
| 110 | + const first = trimmed.indexOf('{'); |
| 111 | + const last = trimmed.lastIndexOf('}'); |
| 112 | + if (first >= 0 && last > first) { |
| 113 | + try { return JSON.parse(trimmed.slice(first, last + 1)); } catch {} |
| 114 | + } |
| 115 | + return null; |
| 116 | + } |
| 117 | +
|
| 118 | + function collectSpans(value, spans = new Map()) { |
| 119 | + if (!value || typeof value !== 'object') return spans; |
| 120 | + if (Array.isArray(value)) { |
| 121 | + for (const item of value) collectSpans(item, spans); |
| 122 | + return spans; |
| 123 | + } |
| 124 | + const file = value.file || value.path || value.file_path || value.rel_path || value.relative_path; |
| 125 | + const start = value.start_line || value.startLine || value.line || value.line_start || value.start; |
| 126 | + const end = value.end_line || value.endLine || value.line_end || value.end || start; |
| 127 | + if (typeof file === 'string' && file.length > 0 && Number.isFinite(Number(start))) { |
| 128 | + const clean = file.replace(/^\/+/, ''); |
| 129 | + const span = { start: Math.max(1, Number(start)), end: Math.max(Number(start), Number(end) || Number(start)) }; |
| 130 | + const list = spans.get(clean) || []; |
| 131 | + list.push(span); |
| 132 | + spans.set(clean, list); |
| 133 | + } |
| 134 | + for (const item of Object.values(value)) collectSpans(item, spans); |
| 135 | + return spans; |
| 136 | + } |
| 137 | +
|
| 138 | + function makeQuery(problem) { |
| 139 | + return String(problem || '') |
| 140 | + .replace(/[`*_#>\[\](){},.;:!?/\\]/g, ' ') |
| 141 | + .split(/\s+/) |
| 142 | + .filter((w) => w.length >= 4 && !/^https?$/.test(w)) |
| 143 | + .slice(0, 8) |
| 144 | + .join(' '); |
| 145 | + } |
| 146 | +
|
          // Drive the tool end-to-end for each selected task and grade the
          // resulting prediction with the official ContextBench evaluator.
          const reports = [];
          let ready = true; // stays true only if every task passes all gates below
          for (const [index, task] of tasks.entries()) {
            const runDir = join(outRoot, `${index + 1}-${task.instance_id}`);
            mkdirSync(runDir, { recursive: true });
            // Per-task cache dir keeps index state isolated between tasks.
            const env = {
              ...process.env,
              CBM_CACHE_DIR: join(runDir, 'cbm-cache'),
              CBM_DIAGNOSTICS: '1'
            };
            // Tool invocations: version probe, full repository index, then a
            // battery of read-only queries whose stdout feeds the prediction.
            const setup = run(cbmBin, ['--version'], { env, timeoutMs: 60_000 });
            const indexRun = run(cbmBin, ['cli', 'index_repository', JSON.stringify({ repo_path: task.repo_checkout_path })], { env, timeoutMs: 45 * 60 * 1000 });
            const listProjects = run(cbmBin, ['cli', '--raw', 'list_projects', '{}'], { env, timeoutMs: 120_000 });
            const graphSchema = run(cbmBin, ['cli', '--raw', 'get_graph_schema', '{}'], { env, timeoutMs: 120_000 });
            const graphSearch = run(cbmBin, ['cli', '--raw', 'search_graph', JSON.stringify({ label: 'Function', limit: 25 })], { env, timeoutMs: 120_000 });
            const query = makeQuery(task.problem_statement);
            const codeSearch = run(cbmBin, ['cli', '--raw', 'search_code', JSON.stringify({ query, output: 'compact', limit: 25 })], { env, timeoutMs: 120_000 });

            // Predictions are extracted exclusively from tool stdout, which is
            // what the laneIsolation record below attests to.
            const parsed = [codeSearch.stdout, graphSearch.stdout, graphSchema.stdout, listProjects.stdout]
              .map(parseJsonish)
              .filter(Boolean);
            const spans = new Map();
            for (const parsedOutput of parsed) collectSpans(parsedOutput, spans);
            const predFiles = [...spans.keys()].slice(0, 20); // cap prediction at 20 files
            const predSpans = Object.fromEntries([...spans.entries()].slice(0, 20));
            const prediction = {
              instance_id: task.instance_id,
              repo_url: task.repo_checkout_path,
              commit: task.base_commit,
              traj_data: {
                pred_steps: [{ files: predFiles, spans: predSpans }],
                pred_files: predFiles,
                pred_spans: predSpans
              },
              model_patch: '' // context-only lane: no patch is produced
            };
            writeFileSync(join(runDir, 'prediction.json'), JSON.stringify(prediction, null, 2));
            // Persist raw tool output so the uploaded artifact is debuggable.
            writeFileSync(join(runDir, 'setup.stdout.log'), setup.stdout);
            writeFileSync(join(runDir, 'setup.stderr.log'), setup.stderr);
            writeFileSync(join(runDir, 'index.stdout.log'), indexRun.stdout);
            writeFileSync(join(runDir, 'index.stderr.log'), indexRun.stderr);
            writeFileSync(join(runDir, 'list-projects.stdout.log'), listProjects.stdout);
            writeFileSync(join(runDir, 'graph-schema.stdout.log'), graphSchema.stdout);
            writeFileSync(join(runDir, 'graph-search.stdout.log'), graphSearch.stdout);
            writeFileSync(join(runDir, 'code-search.stdout.log'), codeSearch.stdout);

            // Export this task's gold answer, clone the official benchmark repo
            // on first use, then score prediction against gold.
            const goldPath = join(runDir, 'gold.json');
            const gold = run('node', ['scripts/contextbench-select-slice.mjs', '--write-gold', '--task-id', task.instance_id, '--out', goldPath, '--payloads', payloadPath], { timeoutMs: 10 * 60 * 1000 });
            const officialRepo = join(root, 'ContextBench-official');
            if (!existsSync(join(officialRepo, 'contextbench', 'evaluate.py'))) {
              run('git', ['clone', '--depth', '1', 'https://github.com/EuniAI/ContextBench.git', officialRepo], { timeoutMs: 10 * 60 * 1000 });
            }
            const scorePath = join(runDir, 'official-score.jsonl');
            const evaluator = run('python', ['-m', 'contextbench.evaluate', '--gold', goldPath, '--pred', join(runDir, 'prediction.json'), '--cache', join(runDir, 'repo-cache'), '--out', scorePath], { cwd: officialRepo, timeoutMs: 20 * 60 * 1000 });
            // Scoreable means the evaluator exited cleanly AND wrote its output.
            const scoreable = evaluator.status === 0 && existsSync(scorePath);
            const nonEmptyPrediction = predFiles.length > 0 && Object.keys(predSpans).length > 0;
            const laneIsolation = {
              allowedTool: 'codebase-memory-mcp',
              observedCommands: [setup.command, indexRun.command, listProjects.command, graphSchema.command, graphSearch.command, codeSearch.command],
              disallowedNativeReadSearchUsedForPrediction: false,
              note: 'Prediction spans are derived only from codebase-memory-mcp CLI JSON/stdout outputs.'
            };
            const report = {
              taskId: task.instance_id,
              repo: task.repo,
              setupStatus: setup.status,
              indexStatus: indexRun.status,
              // Callable if at least one read-only query succeeded.
              toolCallable: [listProjects, graphSchema, graphSearch, codeSearch].some((r) => r.status === 0),
              nonEmptyPrediction,
              officialEvaluatorStatus: evaluator.status,
              officialEvaluatorScoreable: scoreable,
              laneIsolation,
              // Setup/index cost is reported separately from query cost.
              costs: {
                setupDurationMs: setup.durationMs,
                indexDurationMs: indexRun.durationMs,
                queryDurationMs: listProjects.durationMs + graphSchema.durationMs + graphSearch.durationMs + codeSearch.durationMs,
                evaluatorDurationMs: evaluator.durationMs
              },
              query,
              predFiles,
              commands: { setup, indexRun, listProjects, graphSchema, graphSearch, codeSearch, gold, evaluator }
            };
            writeFileSync(join(runDir, 'readiness-report.json'), JSON.stringify(report, null, 2));
            reports.push(report);
            // Readiness gate: setup + index succeeded, tool callable, prediction
            // non-empty, and the official evaluator produced a score file.
            if (!(setup.status === 0 && indexRun.status === 0 && report.toolCallable && nonEmptyPrediction && scoreable)) ready = false;
          }
| 233 | +
|
          // Lane-level rollup: written to disk for the artifact bundle and
          // echoed to the job log for quick inspection.
          const summary = {
            createdAt: new Date().toISOString(),
            lane: 'codebase-memory-mcp',
            ready,
            attemptedRows: reports.length,
            scoreableRows: reports.filter((r) => r.officialEvaluatorScoreable).length,
            nonEmptyPredictionRows: reports.filter((r) => r.nonEmptyPrediction).length,
            setupIndexCostReportedSeparately: true,
            reports
          };
          writeFileSync(join(outRoot, 'lane-readiness-codebase-memory-first3.json'), JSON.stringify(summary, null, 2));
          console.log(JSON.stringify(summary, null, 2));
          // Fail the CI step when any task missed a readiness gate.
          if (!ready) process.exitCode = 1;
          NODE
          node "$CI_READINESS_ROOT/codebase-memory-first3-readiness.mjs"
| 249 | +
|
      # Always upload the readiness logs/reports, even when the run step failed,
      # so failures remain diagnosable from the artifact alone.
      - name: Upload ContextBench recovery artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: contextbench-codebase-memory-first3-readiness
          path: ${{ runner.temp }}/contextbench-readiness
          retention-days: 14
0 commit comments