4646 - name : Install repo dependencies
4747 run : pnpm install --frozen-lockfile
4848
49+ - name : Install official evaluator dependencies
50+ run : python -m pip install tree-sitter tree-sitter-languages
51+
4952 - name : Validate frozen ContextBench fixtures
5053 run : node scripts/contextbench-runner.mjs --validate-fixtures
5154
@@ -156,11 +159,11 @@ jobs:
156159 };
157160 const setup = run(cbmBin, ['--version'], { env, timeoutMs: 60_000 });
158161 const indexRun = run(cbmBin, ['cli', 'index_repository', JSON.stringify({ repo_path: task.repo_checkout_path })], { env, timeoutMs: 45 * 60 * 1000 });
159- const listProjects = run(cbmBin, ['cli', '--raw', ' list_projects', '{} '], { env, timeoutMs: 120_000 });
160- const graphSchema = run(cbmBin, ['cli', '--raw', ' get_graph_schema', '{} '], { env, timeoutMs: 120_000 });
161- const graphSearch = run(cbmBin, ['cli', '--raw', ' search_graph', JSON.stringify({ label: 'Function', limit: 25 })], { env, timeoutMs: 120_000 });
162+ const listProjects = run(cbmBin, ['cli', 'list_projects'], { env, timeoutMs: 120_000 });
163+ const graphSchema = run(cbmBin, ['cli', 'get_graph_schema'], { env, timeoutMs: 120_000 });
164+ const graphSearch = run(cbmBin, ['cli', 'search_graph', JSON.stringify({ label: 'Function', limit: 25 })], { env, timeoutMs: 120_000 });
162165 const query = makeQuery(task.problem_statement);
163- const codeSearch = run(cbmBin, ['cli', '--raw', ' search_code', JSON.stringify({ query, output: 'compact', limit: 25 })], { env, timeoutMs: 120_000 });
166+ const codeSearch = run(cbmBin, ['cli', 'search_code', JSON.stringify({ query, output: 'compact', limit: 25 })], { env, timeoutMs: 120_000 });
164167
165168 const parsed = [codeSearch.stdout, graphSearch.stdout, graphSchema.stdout, listProjects.stdout]
166169 .map(parseJsonish)
@@ -186,9 +189,13 @@ jobs:
186189 writeFileSync(join(runDir, 'index.stdout.log'), indexRun.stdout);
187190 writeFileSync(join(runDir, 'index.stderr.log'), indexRun.stderr);
188191 writeFileSync(join(runDir, 'list-projects.stdout.log'), listProjects.stdout);
192+ writeFileSync(join(runDir, 'list-projects.stderr.log'), listProjects.stderr);
189193 writeFileSync(join(runDir, 'graph-schema.stdout.log'), graphSchema.stdout);
194+ writeFileSync(join(runDir, 'graph-schema.stderr.log'), graphSchema.stderr);
190195 writeFileSync(join(runDir, 'graph-search.stdout.log'), graphSearch.stdout);
196+ writeFileSync(join(runDir, 'graph-search.stderr.log'), graphSearch.stderr);
191197 writeFileSync(join(runDir, 'code-search.stdout.log'), codeSearch.stdout);
198+ writeFileSync(join(runDir, 'code-search.stderr.log'), codeSearch.stderr);
192199
193200 const goldPath = join(runDir, 'gold.json');
194201 const gold = run('node', ['scripts/contextbench-select-slice.mjs', '--write-gold', '--task-id', task.instance_id, '--out', goldPath, '--payloads', payloadPath], { timeoutMs: 10 * 60 * 1000 });
@@ -204,7 +211,7 @@ jobs:
204211 allowedTool: 'codebase-memory-mcp',
205212 observedCommands: [setup.command, indexRun.command, listProjects.command, graphSchema.command, graphSearch.command, codeSearch.command],
206213 disallowedNativeReadSearchUsedForPrediction: false,
207- note: 'Prediction spans are derived only from codebase-memory-mcp CLI JSON/stdout outputs.'
214+ note: 'Prediction spans are derived only from codebase-memory-mcp CLI outputs.'
208215 };
209216 const report = {
210217 taskId: task.instance_id,
0 commit comments