Skip to content

Commit 11d1c69

Browse files
sjarmakclaude
andcommitted
feat: normalize all benchmark suites to 20 tasks each
- Trim ccb_fix: 25 → 20 (removed 3 pytorch + 2 openlibrary tasks) - Dropped: pytorch-release-210, pytorch-relu-gelu-fusion, pytorch-tracer-graph-cleanup - Dropped: openlibrary-fntocli-adapter, openlibrary-search-query - Trim ccb_mcp_onboarding: 25 → 20 (removed search-210..214) - Add 130 new mcp-unique tasks (use_case IDs 142-271) across all 11 ccb_mcp_* suites: - ccb_mcp_incident: 11 → 20 (+9 new tasks) - ccb_mcp_domain: 10 → 20 (+10) - ccb_mcp_security: 10 → 20 (+10) - ccb_mcp_crossrepo_tracing: 9 → 20 (+11) - ccb_mcp_compliance: 7 → 20 (+13) - ccb_mcp_migration: 7 → 20 (+13) - ccb_mcp_crossorg: 5 → 20 (+15) - ccb_mcp_org: 5 → 20 (+15) - ccb_mcp_platform: 5 → 20 (+15) - ccb_mcp_crossrepo: 1 → 20 (+19) - All new tasks use artifact validation; oracle_answer.json pending curation - All 11 mcp_* suites + 9 SDLC suites now have exactly 20 tasks each - Total: 284 → 414 tasks (130 new mcp-unique stubs awaiting oracle curation) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent fe4a526 commit 11d1c69

File tree

1,434 files changed

+154655
-7742
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,434 files changed

+154655
-7742
lines changed
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Baseline image: the benchmark repositories are shallow-cloned into
# /workspace so the agent can read them locally.
FROM ubuntu:22.04

# Suppress interactive debconf prompts from apt.
ENV DEBIAN_FRONTEND=noninteractive

# Base tools
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    git \
    golang-go \
    python3 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# Clone local checkout repos (baseline config: agent has local access to these).
# One RUN per repository so each clone is cached as its own layer.
RUN git clone --depth 1 https://github.com/sg-evals/kubernetes--v1.32.0 /workspace/kubernetes--v1.32.0
RUN git clone --depth 1 https://github.com/sg-evals/client-go--v0.32.0 /workspace/client-go--v0.32.0
RUN git clone --depth 1 https://github.com/sg-evals/api--v0.32.0 /workspace/api--v0.32.0
RUN git clone --depth 1 https://github.com/sg-evals/etcd-io-etcd /workspace/etcd-io-etcd

# Initialize git identity for agent commits.
# Written to the system config (/etc/gitconfig) instead of root's ~/.gitconfig:
# this step runs as root, so a --global setting would be invisible to the
# non-root `claude` user created below. System scope is read by every user,
# root included, and safe.directory is honoured there as well.
# NOTE(review): presumes the agent may run as `claude` — confirm with the harness.
RUN git config --system user.email "agent@example.com" && \
    git config --system user.name "Agent" && \
    git config --system safe.directory '*'

# Create log directories
RUN mkdir -p /logs/agent /logs/verifier

# Pre-create claude user and set ownership at build time so Harbor's
# runtime chown is a no-op (avoids 15-30 min delay on large repos).
# Best effort by design: an already-existing user, a missing directory, or a
# failed chown must not fail the build.
RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
    for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true

# Clear any inherited entrypoint; the harness supplies the command to run.
ENTRYPOINT []
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# CCX-compliance-182 — artifact_only variant
# The image ships no repository checkout; code access is through Sourcegraph MCP only.
# The agent writes an answer.json artifact and the verifier scores that artifact.

FROM ubuntu:22.04

ENV DEBIAN_FRONTEND=noninteractive

# Tooling installed into the image; the apt lists are removed in the same layer.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        git \
        python3 \
 && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# The workspace starts as an empty git repository (code is discovered via MCP
# tools only). The identity is stored in the repository's own config;
# safe.directory goes into the building user's global config.
RUN git init /workspace \
 && git -C /workspace config user.email "agent@example.com" \
 && git -C /workspace config user.name "Agent" \
 && git config --global safe.directory '*'

# Log directories for the agent and the verifier.
RUN mkdir -p /logs/agent /logs/verifier

# Flag file marking artifact-only mode — verifiers and eval scripts check for it.
RUN touch /tmp/.artifact_only_mode

# Create the claude user and hand over ownership at build time.
# Every step is best effort: an existing user, an absent directory, or a
# failing chown never fails the build.
RUN adduser --disabled-password --gecos '' claude 2>/dev/null || true; \
    for dir in /workspace /app /testbed /logs; do \
        if [ -d "$dir" ]; then chown -R claude:claude "$dir" || true; fi; \
    done

ENTRYPOINT []
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# CCX-compliance-182 — sg_only variant
# The image ships no repository checkout; code access is through Sourcegraph MCP only.
# Mirror repos are cloned by the verifier at verification time (no /repo_full/ backup).

FROM ubuntu:22.04

ENV DEBIAN_FRONTEND=noninteractive

# Tooling installed into the image; the apt lists are removed in the same layer.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        git \
        python3 \
 && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# The workspace starts as an empty git repository (code is discovered via MCP
# tools only). The identity is stored in the repository's own config;
# safe.directory goes into the building user's global config.
RUN git init /workspace \
 && git -C /workspace config user.email "agent@example.com" \
 && git -C /workspace config user.name "Agent" \
 && git config --global safe.directory '*'

# Log directories for the agent and the verifier.
RUN mkdir -p /logs/agent /logs/verifier

# Flag file marking sg_only mode — verifiers and eval scripts check for it.
RUN touch /tmp/.sg_only_mode

# Create the claude user and hand over ownership at build time so Harbor's
# runtime chown is a no-op (avoids 15-30 min delay on large repos).
# Every step is best effort: an existing user, an absent directory, or a
# failing chown never fails the build.
RUN adduser --disabled-password --gecos '' claude 2>/dev/null || true; \
    for dir in /workspace /app /testbed /logs; do \
        if [ -d "$dir" ]; then chown -R claude:claude "$dir" || true; fi; \
    done

ENTRYPOINT []
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Kubernetes RBAC ClusterRole Verb Coverage Audit
2+
3+
## Your Task
4+
5+
Audit kubernetes/kubernetes for completeness of RBAC verb handling. Find all Go source files that define the set of allowed verbs (get, list, watch, create, update, patch, delete, deletecollection) and validate that all API handlers enforce these verb constraints.
6+
7+
## Context
8+
9+
This is a compliance-audit task that spans several repositories in the Kubernetes ecosystem.
10+
11+
## Available Resources
12+
13+
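The local `/workspace/` directory contains checkouts of `sg-evals/kubernetes--v1.32.0`, `sg-evals/client-go--v0.32.0`, `sg-evals/api--v0.32.0`, and `sg-evals/etcd-io-etcd`, each in a subdirectory named after the repository (for example, `/workspace/kubernetes--v1.32.0`).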
14+
15+
**Note:** The following repositories are also accessible via Sourcegraph MCP tools:
16+
- `sg-evals/etcd-io-etcd` (etcd-io/etcd)
17+
18+
## Output Format
19+
20+
Create a file at `/workspace/answer.json` with your findings in the following structure:
21+
22+
```json
23+
{
24+
"files": [
25+
{"repo": "org/repo-name", "path": "relative/path/to/file.go"}
26+
],
27+
"symbols": [
28+
{"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "SymbolName"}
29+
],
30+
"chain": [
31+
{"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "FunctionName"}
32+
],
33+
"text": "Narrative explanation of your findings, citing repos and file paths."
34+
}
35+
```
36+
37+
Include only the fields relevant to this task. Your answer is evaluated against a closed-world oracle — completeness matters.
38+
39+
## Evaluation
40+
41+
Your answer will be scored on:
42+
- **File recall and precision**: Did you find all relevant files?
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Task definition for CCX-compliance-182.
version = "1.0"

# Human-readable description of the task.
[metadata]
name        = "CCX-compliance-182"
description = "Kubernetes RBAC ClusterRole Verb Coverage Audit"
license     = "Apache-2.0"

# Task identity, classification, and limits.
[task]
id                 = "CCX-compliance-182"
repo               = "sg-evals/kubernetes--v1.32.0"
category           = "compliance-audit"
language           = "go"
difficulty         = "hard"
time_limit_sec     = 900
mcp_suite          = "ccb_mcp_compliance"
use_case_id        = 182
repo_set_id        = "kubernetes-ecosystem"
mcp_unique         = true
verification_modes = ["artifact"]

# Command used to verify the agent's output and the kind of reward it yields.
[verification]
type        = "test"
command     = "bash /tests/test.sh"
reward_type = "score"
description = "Kubernetes RBAC ClusterRole Verb Coverage Audit"

# Limits applied when building the task image.
[environment]
build_timeout_sec = 600.0
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
# eval.sh — MCP-unique benchmark evaluator for CCX-compliance-182
#
# Exit-code-first (SWE-Factory pattern):
#   exit 0 — agent produced useful output (composite score > 0)
#   exit 1 — total failure (composite score == 0 or missing answer)
#
# Writes /logs/verifier/reward.txt with the composite score [0.0, 1.0].
# Every validation failure (missing or malformed answer.json, missing oracle
# script, score that is not a finite number) records 0.0 and exits 1.

set -euo pipefail

TASK_ID="CCX-compliance-182"
ANSWER_PATH="/workspace/answer.json"
TASK_SPEC_PATH="/tests/task_spec.json"
ORACLE_CHECKS="/tests/oracle_checks.py"
REWARD_PATH="/logs/verifier/reward.txt"

# Report an error, record a zero reward, and stop with a failing exit code.
#   $1 — message printed after the "ERROR: " prefix
fail() {
    echo "ERROR: $1"
    echo "0.0" > "$REWARD_PATH"
    exit 1
}

mkdir -p /logs/verifier

echo "=== ${TASK_ID} evaluator ==="
echo "Task spec: $TASK_SPEC_PATH"
echo "Answer: $ANSWER_PATH"
echo ""

# sg_only mode guard: when the mode flag and the verifier wrapper are both
# present, source the wrapper before scoring. What the wrapper does is defined
# outside this script.
if [ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ]; then
    echo "sg_only mode: sourcing verifier wrapper..."
    source /tests/sgonly_verifier_wrapper.sh
fi

# Verify answer file exists
if [ ! -f "$ANSWER_PATH" ]; then
    fail "answer.json not found at $ANSWER_PATH"
fi

# Validate answer is valid JSON. The path is handed to Python through argv so
# it is never interpreted as part of the Python source text.
if ! python3 -c 'import json, sys; json.load(open(sys.argv[1]))' "$ANSWER_PATH" 2>/dev/null; then
    fail "answer.json is not valid JSON"
fi

echo "answer.json found and valid JSON"

# Run oracle checks
if [ ! -f "$ORACLE_CHECKS" ]; then
    fail "oracle_checks.py not found at $ORACLE_CHECKS"
fi

echo "Running oracle checks..."
# The scorer's combined stdout/stderr is echoed to stderr for the logs, and its
# last line is taken as the score. `|| true` keeps `set -e`/`pipefail` from
# aborting here, so a failing scorer is reported by the validation below.
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -n 1) || true

# Validate score is a finite number. float() alone would also accept "nan" and
# "inf", which are not usable rewards.
if ! python3 -c 'import math, sys; sys.exit(0 if math.isfinite(float(sys.argv[1])) else 1)' "$SCORE" 2>/dev/null; then
    fail "oracle_checks.py did not return a valid score: $SCORE"
fi

echo ""
echo "Composite score: $SCORE"
echo "$SCORE" > "$REWARD_PATH"

# Exit based on score (SWE-Factory exit-code-first pattern)
python3 -c 'import sys; sys.exit(0 if float(sys.argv[1]) > 0 else 1)' "$SCORE"
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"files": [],
3+
"text": "Oracle not yet curated \u2014 task stub.",
4+
"_metadata": {
5+
"oracle_type": "tbd",
6+
"status": "pending_curation"
7+
}
8+
}

0 commit comments

Comments
 (0)