Skip to content

Commit 771f384

Browse files
sjarmakclaude
andcommitted
Add 23 scaling gap tasks from 11 new repos (381→404 tasks)
Fill repo size distribution gaps for scaling analysis: - 500MB-1GB tier: 24→35 tasks, 10 unique repos (grpc, tidb, ceph, beam, bazel) - >1.5GB XL tier: 21→33 tasks, 12 unique repos (elasticsearch, godot, roslyn, typescript, cockroach, clickhouse) 11 SDLC tasks (debug/document/refactor/secure/test/design/understand) 12 Org tasks (platform/compliance/security/crossorg/migration/org/crossrepo_tracing) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 58df5ae commit 771f384

File tree

247 files changed

+25014
-93
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

247 files changed

+25014
-93
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Baseline environment: Ubuntu 22.04 with a local shallow clone of the
# sg-evals/tidb--v8.5.0 mirror under /workspace.
FROM ubuntu:22.04

# Keep apt from prompting, both in the build steps below and in the container.
ENV DEBIAN_FRONTEND=noninteractive

# Base tools (one package per line, sorted for easier diffs).
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    git \
    golang-go \
    python3 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# Clone local checkout repos (baseline config: agent has local access to these)
RUN git clone --depth 1 https://github.com/sg-evals/tidb--v8.5.0 /workspace/tidb--v8.5.0

# Initialize git identity for agent commits.
# Use --system (/etc/gitconfig) instead of --global: this step runs as root, so
# --global would only write root's ~/.gitconfig and the `claude` user created
# below would get neither the identity nor the safe.directory setting.
RUN git config --system user.email "agent@example.com" && \
    git config --system user.name "Agent" && \
    git config --system safe.directory '*'

# Create log directories
RUN mkdir -p /logs/agent /logs/verifier

# Pre-create claude user and set ownership at build time so Harbor's
# runtime chown is a no-op (avoids 15-30 min delay on large repos).
# Best-effort by design: a failing adduser (e.g. user already exists) is
# ignored, only directories that exist are chowned, and the trailing
# `|| true` keeps this step from ever failing the build.
RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
    for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true

# Clear any entrypoint inherited from the base image.
ENTRYPOINT []
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# ccx-compliance-286 — artifact_baseline variant
# Baseline with local code + artifact mode (verifier parses answer.json).

FROM ubuntu:22.04

# Keep apt from prompting, both in the build steps below and in the container.
ENV DEBIAN_FRONTEND=noninteractive

# Base tools (one package per line, sorted for easier diffs).
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    git \
    golang-go \
    python3 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# Clone local checkout repos (baseline config: agent has local access to these)
RUN git clone --depth 1 https://github.com/sg-evals/tidb--v8.5.0 /workspace/tidb--v8.5.0

# Initialize git identity for agent commits.
# Use --system (/etc/gitconfig) instead of --global: this step runs as root, so
# --global would only write root's ~/.gitconfig and the `claude` user created
# below would get neither the identity nor the safe.directory setting.
RUN git config --system user.email "agent@example.com" && \
    git config --system user.name "Agent" && \
    git config --system safe.directory '*'

# Create log directories
RUN mkdir -p /logs/agent /logs/verifier

# Pre-create claude user and set ownership at build time so Harbor's
# runtime chown is a no-op (avoids 15-30 min delay on large repos).
# Best-effort by design: a failing adduser (e.g. user already exists) is
# ignored, only directories that exist are chowned, and the trailing
# `|| true` keeps this step from ever failing the build.
RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
    for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true

# Mark artifact-only mode — verifier parses answer.json
RUN touch /tmp/.artifact_only_mode

# Clear any entrypoint inherited from the base image.
ENTRYPOINT []
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# ccx-compliance-286 — artifact_only variant
# The image ships without a local clone of the target repo; code access is
# through Sourcegraph MCP only. The agent writes an answer.json artifact and
# the verifier scores that artifact.

FROM ubuntu:22.04

# Non-interactive apt, plus the Sourcegraph mirror(s) this task targets.
ENV DEBIAN_FRONTEND=noninteractive \
    SOURCEGRAPH_REPOS="sg-evals/tidb--v8.5.0"

# Minimal toolset; apt lists are dropped in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    git \
    python3 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# Start from an empty repository in /workspace with a repo-local commit
# identity; root's global config additionally trusts every directory.
RUN git config --global safe.directory '*' && \
    git init && \
    git config user.email "agent@example.com" && \
    git config user.name "Agent"

# Log locations for the agent and the verifier.
RUN mkdir -p /logs/agent /logs/verifier

# Flag file read by verifiers and eval scripts to detect artifact-only mode.
RUN touch /tmp/.artifact_only_mode

# Create the claude account and give it the work directories now, at build
# time. Every part of this step is best-effort and cannot fail the build.
RUN adduser --disabled-password --gecos '' claude 2>/dev/null || true; \
    for dir in /workspace /app /testbed /logs; do \
        if [ -d "$dir" ]; then chown -R claude:claude "$dir" || true; fi; \
    done

ENTRYPOINT []
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# ccx-compliance-286 — sg_only_env variant (v2: clone-at-verify)
# Empty workspace — agent uses Sourcegraph MCP for code access.
# Verifier clones mirror(s) at verification time via clone manifest.

FROM ubuntu:22.04

# Keep apt from prompting, both in the build steps below and in the container.
ENV DEBIAN_FRONTEND=noninteractive
# Sourcegraph mirror(s) this task targets.
ENV SOURCEGRAPH_REPOS="sg-evals/tidb--v8.5.0"

# Base tools (one package per line, sorted for easier diffs).
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    git \
    python3 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# Empty git repo so agent can commit work.
# The identity is repo-local. safe.directory is set as in the artifact_only
# variant: /workspace is chowned to `claude` below, and git run by a different
# user (e.g. root) refuses a repository it does not own unless the path is
# listed as safe.
RUN git init && \
    git config user.email "agent@example.com" && \
    git config user.name "Agent" && \
    git config --global safe.directory '*'

# Create log directories
RUN mkdir -p /logs/agent /logs/verifier

# Clone manifest for verifier (clone-at-verify strategy)
RUN echo '{"workdir":"/workspace","repos":[{"mirror":"sg-evals/tidb--v8.5.0","target_dir":"tidb--v8.5.0"}]}' > /tmp/.sg_only_clone_manifest.json

# Mark sg_only mode
RUN touch /tmp/.sg_only_mode

# Pre-create claude user and set ownership at build time.
# Best-effort by design: a failing adduser (e.g. user already exists) is
# ignored, only directories that exist are chowned, and the trailing
# `|| true` keeps this step from ever failing the build.
RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
    for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true

# Clear any entrypoint inherited from the base image.
ENTRYPOINT []
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# TiDB SQL Privilege Check and RBAC Enforcement Audit
2+
3+
## Your Task
4+
5+
Audit the privilege and RBAC enforcement in pingcap/tidb. Find all Go source files under pkg/privilege/ and pkg/planner/ that implement privilege checking for SQL statements. Identify: the PrivilegeManager interface, the MySQLPrivilege cache, the RequestVerification method and its callers, the role graph resolution logic, and any files that map SQL statement types to required privileges. For each file report the path and its role in the privilege check flow.
6+
7+
## Context
8+
9+
You are working on a codebase task involving repos from the compliance domain.
10+
11+
## Available Resources
12+
13+
The local `/workspace/` directory contains a checkout of `sg-evals/tidb--v8.5.0` at `/workspace/tidb--v8.5.0`.
14+
15+
## Output Format
16+
17+
Create a file at `/workspace/answer.json` with your findings in the following structure:
18+
19+
```json
20+
{
21+
"files": [
22+
{"repo": "org/repo-name", "path": "relative/path/to/file.go"}
23+
],
24+
"symbols": [
25+
{"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "SymbolName"}
26+
],
27+
"chain": [
28+
{"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "FunctionName"}
29+
],
30+
"text": "Narrative explanation of your findings, citing repos and file paths."
31+
}
32+
```
33+
34+
Include only the fields relevant to this task. Your answer is evaluated against a closed-world oracle — completeness matters.
35+
36+
## Evaluation
37+
38+
Your answer will be scored on:
39+
- **File recall and precision**: Did you find all relevant files, without listing irrelevant ones?
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# IMPORTANT: Source Code Access
2+
3+
**Local source files are not present.** Your workspace does not contain source code. You **MUST** use Sourcegraph MCP tools to discover, read, and understand code before making any changes.
4+
5+
**Target Repositories (version-pinned mirrors):**
6+
7+
- `github.com/sg-evals/tidb--v8.5.0` — use `repo:^github.com/sg-evals/tidb--v8.5.0$` filter
8+
9+
Scope ALL `sg_keyword_search`/`sg_nls_search` queries to these repos.
10+
Use the repo name as the `repo` parameter for `sg_read_file`/`sg_go_to_definition`/`sg_find_references`.
11+
12+
13+
## Required Workflow
14+
15+
1. **Search first** — Use MCP tools to find relevant files and understand existing patterns
16+
2. **Read remotely** — Use `sg_read_file` to read full file contents from Sourcegraph
17+
3. **Edit locally** — Use Edit, Write, and Bash to create or modify files in your working directory
18+
4. **Verify locally** — Run tests with Bash to check your changes
19+
20+
## Tool Selection
21+
22+
| Goal | Tool |
23+
|------|------|
24+
| Exact symbol/string | `sg_keyword_search` |
25+
| Concepts/semantic search | `sg_nls_search` |
26+
| Trace usage/callers | `sg_find_references` |
27+
| See implementation | `sg_go_to_definition` |
28+
| Read full file | `sg_read_file` |
29+
| Browse structure | `sg_list_files` |
30+
| Find repos | `sg_list_repos` |
31+
| Search commits | `sg_commit_search` |
32+
| Track changes | `sg_diff_search` |
33+
| Compare versions | `sg_compare_revisions` |
34+
35+
**Decision logic:**
36+
1. Know the exact symbol? → `sg_keyword_search`
37+
2. Know the concept, not the name? → `sg_nls_search`
38+
3. Need definition of a symbol? → `sg_go_to_definition`
39+
4. Need all callers/references? → `sg_find_references`
40+
5. Need full file content? → `sg_read_file`
41+
42+
## Scoping (Always Do This)
43+
44+
```
45+
repo:^github.com/ORG/REPO$ # Exact repo (preferred)
46+
repo:github.com/ORG/ # All repos in org
47+
file:.*\.ts$ # TypeScript only
48+
file:src/api/ # Specific directory
49+
```
50+
51+
Start narrow. Expand only if results are empty.
52+
53+
## Efficiency Rules
54+
55+
- Chain searches logically: search → read → references → definition
56+
- Don't re-search for the same pattern; use results from prior calls
57+
- Prefer `sg_keyword_search` over `sg_nls_search` when you have exact terms
58+
- Read 2-3 related files before synthesising, rather than one at a time
59+
- Don't read 20+ remote files without writing code — once you understand the pattern, start implementing
60+
61+
## If Stuck
62+
63+
If MCP search returns no results:
64+
1. Broaden the search query (synonyms, partial identifiers)
65+
2. Try `sg_nls_search` for semantic matching
66+
3. Use `sg_list_files` to browse the directory structure
67+
4. Use `sg_list_repos` to verify the repository name
68+
69+
---
70+
71+
**Sourcegraph Repositories:** `github.com/sg-evals/tidb--v8.5.0`
72+
73+
# TiDB SQL Privilege Check and RBAC Enforcement Audit
74+
75+
## Your Task
76+
77+
Audit the privilege and RBAC enforcement in pingcap/tidb. Find all Go source files under pkg/privilege/ and pkg/planner/ that implement privilege checking for SQL statements. Identify: the PrivilegeManager interface, the MySQLPrivilege cache, the RequestVerification method and its callers, the role graph resolution logic, and any files that map SQL statement types to required privileges. For each file report the path and its role in the privilege check flow.
78+
79+
## Context
80+
81+
You are working on a codebase task involving repos from the compliance domain.
82+
83+
## Available Resources
84+
85+
The local `/workspace/` directory does not contain the source code; `sg-evals/tidb--v8.5.0` is available only through the Sourcegraph MCP tools described above.
86+
87+
## Output Format
88+
89+
Create a file at `/workspace/answer.json` with your findings in the following structure:
90+
91+
```json
92+
{
93+
"files": [
94+
{"repo": "org/repo-name", "path": "relative/path/to/file.go"}
95+
],
96+
"symbols": [
97+
{"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "SymbolName"}
98+
],
99+
"chain": [
100+
{"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "FunctionName"}
101+
],
102+
"text": "Narrative explanation of your findings, citing repos and file paths."
103+
}
104+
```
105+
106+
Include only the fields relevant to this task. Your answer is evaluated against a closed-world oracle — completeness matters.
107+
108+
## Evaluation
109+
110+
Your answer will be scored on:
111+
- **File recall and precision**: Did you find all relevant files, without listing irrelevant ones?
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Task definition for CCX-compliance-286.
version = "1.0"

# Human-readable metadata.
[metadata]
name = "CCX-compliance-286"
license = "Apache-2.0"
description = "TiDB SQL Privilege Check and RBAC Enforcement Audit"

# Task identity, classification, and limits.
[task]
id = "CCX-compliance-286"
use_case_id = 286
repo = "sg-evals/tidb--v8.5.0"
repo_set_id = "tidb-database"
category = "compliance-audit"
language = "go"
difficulty = "hard"
time_limit_sec = 900
mcp_suite = "csb_org_compliance"
mcp_unique = true
verification_modes = ["artifact"]

# How the result is verified: a test command whose outcome is a score.
[verification]
type = "test"
command = "bash /tests/test.sh"
reward_type = "score"
description = "TiDB SQL Privilege Check and RBAC Enforcement Audit"

# Image build settings.
[environment]
build_timeout_sec = 600.0
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
# eval.sh — MCP-unique benchmark evaluator for CCX-compliance-286
# Exit-code-first (SWE-Factory pattern):
#   exit 0 — agent produced useful output (composite score > 0)
#   exit 1 — total failure (composite score == 0 or missing answer)
#
# Writes /logs/verifier/reward.txt with the composite score [0.0, 1.0]

set -euo pipefail

TASK_ID="CCX-compliance-286"
ANSWER_PATH="/workspace/answer.json"
TASK_SPEC_PATH="/tests/task_spec.json"
ORACLE_CHECKS="/tests/oracle_checks.py"
REWARD_PATH="/logs/verifier/reward.txt"

# fail MESSAGE — print "ERROR: MESSAGE", record a zero reward, and exit 1.
fail() {
    echo "ERROR: $1"
    echo "0.0" > "$REWARD_PATH"
    exit 1
}

# Make sure the reward file's directory exists before anything can call fail.
mkdir -p "$(dirname "$REWARD_PATH")"

echo "=== $TASK_ID evaluator ==="
echo "Task spec: $TASK_SPEC_PATH"
echo "Answer:    $ANSWER_PATH"
echo ""

# sg_only mode guard: restore full repo if verifier wrapper exists
if [ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ]; then
    echo "sg_only mode: sourcing verifier wrapper..."
    source /tests/sgonly_verifier_wrapper.sh
fi

# Verify answer file exists
if [ ! -f "$ANSWER_PATH" ]; then
    fail "answer.json not found at $ANSWER_PATH"
fi

# Validate answer is valid JSON.
# The path is passed as an argument rather than pasted into the Python source,
# so quotes or backslashes in it cannot break or alter the snippet.
if ! python3 -c 'import json, sys; json.load(open(sys.argv[1]))' "$ANSWER_PATH" 2>/dev/null; then
    fail "answer.json is not valid JSON"
fi

echo "answer.json found and valid JSON"

# Run oracle checks
if [ ! -f "$ORACLE_CHECKS" ]; then
    fail "oracle_checks.py not found at $ORACLE_CHECKS"
fi

echo "Running oracle checks..."
# stdout and stderr are merged, echoed to stderr for the log, and the last
# line of the combined output is taken as the score. `|| true` stops
# `set -e`/`pipefail` from aborting here, so a failed run falls through to the
# validation below and is recorded as a zero reward.
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true

# Validate score is a finite number. float() alone also accepts "nan" and
# "inf", which must not be written out as a reward.
if ! python3 -c 'import math, sys; sys.exit(0 if math.isfinite(float(sys.argv[1])) else 1)' "$SCORE" 2>/dev/null; then
    fail "oracle_checks.py did not return a valid score: $SCORE"
fi

echo ""
echo "Composite score: $SCORE"
echo "$SCORE" > "$REWARD_PATH"

# Exit based on score (SWE-Factory exit-code-first pattern).
# As the last command, this python3 call's status is the script's exit status.
python3 -c 'import sys; sys.exit(0 if float(sys.argv[1]) > 0 else 1)' "$SCORE"

0 commit comments

Comments
 (0)