sourcegraph
diff --git a/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/environment/Dockerfile‎
Lines changed: 35 additions & 0 deletions b/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/environment/Dockerfile‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/environment/Dockerfile.artifact_only‎
Lines changed: 34 additions & 0 deletions b/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/environment/Dockerfile.artifact_only‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/environment/Dockerfile.sg_only‎
Lines changed: 35 additions & 0 deletions b/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/environment/Dockerfile.sg_only‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/instruction.md‎
Lines changed: 42 additions & 0 deletions b/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/instruction.md‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/task.toml‎
Lines changed: 29 additions & 0 deletions b/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/task.toml‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/tests/eval.sh‎
Lines changed: 68 additions & 0 deletions b/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/tests/eval.sh‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/tests/oracle_answer.json‎
Lines changed: 8 additions & 0 deletions b/‎benchmarks/ccb_mcp_compliance/ccx-compliance-182/tests/oracle_answer.json‎
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,35 @@
+# Base image: Ubuntu 22.04.
+FROM ubuntu:22.04
+
+# Keep apt/debconf from prompting during package installation.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Base tools: git with TLS root certificates, curl, Python 3 and the distro Go
+# toolchain. Packages are listed alphabetically, and the apt index is removed
+# in the same layer that downloaded it.
+RUN apt-get update \
+    && apt-get install --yes --no-install-recommends \
+        ca-certificates \
+        curl \
+        git \
+        golang-go \
+        python3 \
+    && rm -rf /var/lib/apt/lists/*
+
+# All repository checkouts live beneath /workspace.
+WORKDIR /workspace
+
+# Clone local checkout repos (baseline config: agent has local access to these).
+# --depth 1 makes each clone shallow (history truncated to a single commit).
+# Every clone is a separate RUN, so each repository lands in its own image
+# layer and is cached independently of the others.
+RUN git clone --depth 1 https://github.com/sg-evals/kubernetes--v1.32.0 /workspace/kubernetes--v1.32.0
+RUN git clone --depth 1 https://github.com/sg-evals/client-go--v0.32.0 /workspace/client-go--v0.32.0
+RUN git clone --depth 1 https://github.com/sg-evals/api--v0.32.0 /workspace/api--v0.32.0
+RUN git clone --depth 1 https://github.com/sg-evals/etcd-io-etcd /workspace/etcd-io-etcd
+
+# Initialize git identity for agent commits.
+# The settings are written with --system (/etc/gitconfig) instead of --global:
+# at build time --global only touches root's ~/.gitconfig, which would leave
+# the non-root `claude` user created later in this Dockerfile without a commit
+# identity. The system file is read by every user, root included.
+# safe.directory='*' turns off git's repository-ownership check for all paths.
+RUN git config --system user.email "agent@example.com" && \
+    git config --system user.name "Agent" && \
+    git config --system safe.directory '*'
+
+# Log locations for the agent and for the verifier.
+RUN mkdir -p /logs/agent && mkdir -p /logs/verifier
+
+# Create the `claude` account and hand it the work directories while the image
+# is being built, so that Harbor's runtime chown has nothing left to do
+# (avoids a 15-30 min delay on large repos).
+# Both steps are best-effort: an adduser failure is ignored, directories that
+# do not exist are skipped, and the instruction always exits 0.
+RUN adduser --disabled-password --gecos '' claude 2>/dev/null || true; \
+    for dir in /workspace /app /testbed /logs; do \
+        if [ -d "$dir" ]; then chown -R claude:claude "$dir"; fi; \
+    done || true
+
+# Explicitly empty entrypoint.
+ENTRYPOINT []
@@ -0,0 +1,34 @@
+# CCX-compliance-182 — artifact_only variant
+# No local repo clone — agent uses Sourcegraph MCP exclusively for code access.
+# Agent produces answer.json artifact; verifier scores the artifact.
+
+# Base image: Ubuntu 22.04.
+FROM ubuntu:22.04
+
+# Keep apt/debconf from prompting during package installation.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Minimal tool set (no Go toolchain in this variant): TLS root certificates,
+# curl, git and Python 3. Packages are listed alphabetically; the apt index is
+# removed in the same layer that downloaded it.
+RUN apt-get update \
+    && apt-get install --yes --no-install-recommends \
+        ca-certificates \
+        curl \
+        git \
+        python3 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# No source is checked out here — the agent discovers code via MCP tools only.
+# /workspace becomes an empty git repository with a repo-local commit identity;
+# safe.directory='*' goes into the building user's global git config.
+# `set -e` aborts the instruction at the first failing command.
+RUN set -e; \
+    git init; \
+    git config user.email "agent@example.com"; \
+    git config user.name "Agent"; \
+    git config --global safe.directory '*'
+
+# Log locations for the agent and for the verifier.
+RUN mkdir -p /logs/agent && mkdir -p /logs/verifier
+
+# Marker file announcing artifact-only mode to verifiers and eval scripts.
+RUN touch /tmp/.artifact_only_mode
+
+# Create the `claude` account and give it the work directories at build time.
+# Both steps are best-effort: an adduser failure is ignored, directories that
+# do not exist are skipped, and the instruction always exits 0.
+RUN adduser --disabled-password --gecos '' claude 2>/dev/null || true; \
+    for dir in /workspace /app /testbed /logs; do \
+        if [ -d "$dir" ]; then chown -R claude:claude "$dir"; fi; \
+    done || true
+
+# Explicitly empty entrypoint.
+ENTRYPOINT []
@@ -0,0 +1,35 @@
+# CCX-compliance-182 — sg_only variant
+# No local repo clone — agent uses Sourcegraph MCP exclusively for code access.
+# The verifier clones mirror repos at verification time (no /repo_full/ backup).
+
+# Base image: Ubuntu 22.04.
+FROM ubuntu:22.04
+
+# Keep apt/debconf from prompting during package installation.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Minimal tool set (no Go toolchain in this variant): TLS root certificates,
+# curl, git and Python 3. Packages are listed alphabetically; the apt index is
+# removed in the same layer that downloaded it.
+RUN apt-get update \
+    && apt-get install --yes --no-install-recommends \
+        ca-certificates \
+        curl \
+        git \
+        python3 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# No source is checked out here — the agent discovers code via MCP tools only.
+# /workspace is initialised as an empty git repository with a repo-local commit
+# identity; safe.directory='*' goes into the building user's global git config.
+RUN git init \
+    && git config user.email "agent@example.com" \
+    && git config user.name "Agent" \
+    && git config --global safe.directory '*'
+
+# Log locations for the agent and for the verifier.
+RUN mkdir -p /logs/agent && mkdir -p /logs/verifier
+
+# Marker file announcing sg_only mode to verifiers and eval scripts.
+RUN touch /tmp/.sg_only_mode
+
+# Create the `claude` account and hand it the work directories while the image
+# is being built, so that Harbor's runtime chown has nothing left to do
+# (avoids a 15-30 min delay on large repos).
+# Both steps are best-effort: an adduser failure is ignored, directories that
+# do not exist are skipped, and the instruction always exits 0.
+RUN adduser --disabled-password --gecos '' claude 2>/dev/null || true; \
+    for dir in /workspace /app /testbed /logs; do \
+        if [ -d "$dir" ]; then chown -R claude:claude "$dir"; fi; \
+    done || true
+
+# Explicitly empty entrypoint.
+ENTRYPOINT []
@@ -0,0 +1,42 @@
+# Kubernetes RBAC ClusterRole Verb Coverage Audit
+
+## Your Task
+
+Audit kubernetes/kubernetes for completeness of RBAC verb handling. Find all Go source files that define the set of allowed verbs (get, list, watch, create, update, patch, delete, deletecollection) and validate that all API handlers enforce these verb constraints.
+
+## Context
+
+You are working on a codebase task involving repos from the compliance domain.
+
+## Available Resources
+
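+The local `/workspace/` directory contains checkouts of the following repositories, each in a subdirectory named after the repo (for example `/workspace/kubernetes--v1.32.0`): sg-evals/kubernetes--v1.32.0, sg-evals/client-go--v0.32.0, sg-evals/api--v0.32.0, sg-evals/etcd-io-etcd.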
+
+**Note:** The following repository is also accessible via Sourcegraph MCP tools:
+- `sg-evals/etcd-io-etcd` (etcd-io/etcd)
+
+## Output Format
+
+Create a file at `/workspace/answer.json` with your findings in the following structure:
+
+```json
+{
+  "files": [
+    {"repo": "org/repo-name", "path": "relative/path/to/file.go"}
+  ],
+  "symbols": [
+    {"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "SymbolName"}
+  ],
+  "chain": [
+    {"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "FunctionName"}
+  ],
+  "text": "Narrative explanation of your findings, citing repos and file paths."
+}
+```
+
+Include only the fields relevant to this task. Your answer is evaluated against a closed-world oracle — completeness matters.
+
+## Evaluation
+
+Your answer will be scored on:
+- **File recall and precision**: Did you find all relevant files, and only relevant files?
@@ -0,0 +1,29 @@
+version = "1.0"
+
+[metadata]
+name = "CCX-compliance-182"
+description = "Kubernetes RBAC ClusterRole Verb Coverage Audit"
+license = "Apache-2.0"
+
+[task]
+id = "CCX-compliance-182"
+repo = "sg-evals/kubernetes--v1.32.0"
+category = "compliance-audit"
+language = "go"
+difficulty = "hard"
+time_limit_sec = 900
+mcp_suite = "ccb_mcp_compliance"
+use_case_id = 182
+repo_set_id = "kubernetes-ecosystem"
+mcp_unique = true
+verification_modes = ["artifact"]
+
+# How the task result is verified and rewarded.
+[verification]
+type = "test"
+# NOTE(review): this change adds tests/eval.sh, not tests/test.sh — confirm
+# that /tests/test.sh is provided elsewhere (e.g. generated by the harness).
+command = "bash /tests/test.sh"
+
+reward_type = "score"
+description = "Kubernetes RBAC ClusterRole Verb Coverage Audit"
+
+[environment]
+build_timeout_sec = 600.0
@@ -0,0 +1,68 @@
+#!/bin/bash
+# eval.sh — MCP-unique benchmark evaluator for CCX-compliance-182.
+#
+# Exit status follows the exit-code-first (SWE-Factory) pattern:
+#   0 — the agent produced useful output (composite score > 0)
+#   1 — total failure (composite score == 0 or missing answer)
+#
+# The composite score, in [0.0, 1.0], is written to /logs/verifier/reward.txt.
+
+set -euo pipefail
+
+# Fixed locations used throughout the evaluation.
+TASK_ID="CCX-compliance-182"
+ANSWER_PATH="/workspace/answer.json"
+TASK_SPEC_PATH="/tests/task_spec.json"
+ORACLE_CHECKS="/tests/oracle_checks.py"
+REWARD_PATH="/logs/verifier/reward.txt"
+
+# The reward file's directory has to exist before anything is written to it.
+mkdir -p /logs/verifier
+
+# Banner, followed by one empty line.
+printf '%s\n' \
+    "=== ${TASK_ID} evaluator ===" \
+    "Task spec: ${TASK_SPEC_PATH}" \
+    "Answer:    ${ANSWER_PATH}" \
+    ""
+
+# sg_only mode guard: when both the sg_only marker file and the verifier
+# wrapper exist, source the wrapper into this shell.
+# NOTE(review): the wrapper presumably restores the full repo — its contents
+# are not visible from this script; confirm.
+if [[ -f /tmp/.sg_only_mode && -f /tests/sgonly_verifier_wrapper.sh ]]; then
+    echo "sg_only mode: sourcing verifier wrapper..."
+    . /tests/sgonly_verifier_wrapper.sh
+fi
+
+# Verify answer file exists
+if [ ! -f "$ANSWER_PATH" ]; then
+    echo "ERROR: answer.json not found at $ANSWER_PATH"
+    echo "0.0" > "$REWARD_PATH"
+    exit 1
+fi
+
+# Validate answer is valid JSON
+if ! python3 -c "import json; json.load(open('$ANSWER_PATH'))" 2>/dev/null; then
+    echo "ERROR: answer.json is not valid JSON"
+    echo "0.0" > "$REWARD_PATH"
+    exit 1
+fi
+
+echo "answer.json found and valid JSON"
+
+# Run oracle checks
+if [ ! -f "$ORACLE_CHECKS" ]; then
+    echo "ERROR: oracle_checks.py not found at $ORACLE_CHECKS"
+    echo "0.0" > "$REWARD_PATH"
+    exit 1
+fi
+
+echo "Running oracle checks..."
+SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
+
+# Validate score is a number
+if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then
+    echo "ERROR: oracle_checks.py did not return a valid score: $SCORE"
+    echo "0.0" > "$REWARD_PATH"
+    exit 1
+fi
+
+echo ""
+echo "Composite score: $SCORE"
+echo "$SCORE" > "$REWARD_PATH"
+
+# Exit based on score (SWE-Factory exit-code-first pattern)
+python3 -c "import sys; sys.exit(0 if float('$SCORE') > 0 else 1)"
@@ -0,0 +1,8 @@
+{
+  "files": [],
+  "text": "Oracle not yet curated \u2014 task stub.",
+  "_metadata": {
+    "oracle_type": "tbd",
+    "status": "pending_curation"
+  }
+}