sourcegraph
diff --git a/‎benchmarks/csb_org_compliance/ccx-compliance-286/environment/Dockerfile‎
Lines changed: 32 additions & 0 deletions b/‎benchmarks/csb_org_compliance/ccx-compliance-286/environment/Dockerfile‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎benchmarks/csb_org_compliance/ccx-compliance-286/environment/Dockerfile.artifact_baseline‎
Lines changed: 38 additions & 0 deletions b/‎benchmarks/csb_org_compliance/ccx-compliance-286/environment/Dockerfile.artifact_baseline‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎benchmarks/csb_org_compliance/ccx-compliance-286/environment/Dockerfile.artifact_only‎
Lines changed: 35 additions & 0 deletions b/‎benchmarks/csb_org_compliance/ccx-compliance-286/environment/Dockerfile.artifact_only‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎benchmarks/csb_org_compliance/ccx-compliance-286/environment/Dockerfile.sg_only‎
Lines changed: 36 additions & 0 deletions b/‎benchmarks/csb_org_compliance/ccx-compliance-286/environment/Dockerfile.sg_only‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎benchmarks/csb_org_compliance/ccx-compliance-286/instruction.md‎
Lines changed: 39 additions & 0 deletions b/‎benchmarks/csb_org_compliance/ccx-compliance-286/instruction.md‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎benchmarks/csb_org_compliance/ccx-compliance-286/instruction_mcp.md‎
Lines changed: 111 additions & 0 deletions b/‎benchmarks/csb_org_compliance/ccx-compliance-286/instruction_mcp.md‎
Lines changed: 111 additions & 0 deletions
diff --git a/‎benchmarks/csb_org_compliance/ccx-compliance-286/task.toml‎
Lines changed: 29 additions & 0 deletions b/‎benchmarks/csb_org_compliance/ccx-compliance-286/task.toml‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎benchmarks/csb_org_compliance/ccx-compliance-286/tests/eval.sh‎
Lines changed: 68 additions & 0 deletions b/‎benchmarks/csb_org_compliance/ccx-compliance-286/tests/eval.sh‎
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,32 @@
+FROM ubuntu:22.04
+
+# ENV (rather than ARG) persists into the running container, so apt-get stays
+# non-interactive both during the build and at runtime.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Base tools (kept alphabetical for easy diffing). The apt lists are removed in
+# the same layer so they never reach the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    git \
+    golang-go \
+    python3 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# Clone local checkout repos (baseline config: agent has local access to these)
+RUN git clone --depth 1 https://github.com/sg-evals/tidb--v8.5.0 /workspace/tidb--v8.5.0
+
+# Initialize git identity for agent commits.
+# --system writes /etc/gitconfig, so the identity and the safe.directory
+# exemption apply to every user in the image (root and the claude user created
+# below). --global here would only write root's ~/.gitconfig.
+RUN git config --system user.email "agent@example.com" && \
+    git config --system user.name "Agent" && \
+    git config --system safe.directory '*'
+
+# Create log directories
+RUN mkdir -p /logs/agent /logs/verifier
+
+# Pre-create claude user and set ownership at build time so Harbor's
+# runtime chown is a no-op (avoids 15-30 min delay on large repos).
+# Best-effort: an already-existing user, a missing directory or a failed chown
+# must not fail the build, hence the `|| true` guards.
+RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
+    for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true
+
+# Clear any inherited entrypoint so the container runs exactly the command it is given.
+ENTRYPOINT []
@@ -0,0 +1,38 @@
+# ccx-compliance-286 — artifact_baseline variant
+# Baseline with local code + artifact mode (verifier parses answer.json).
+
+FROM ubuntu:22.04
+
+# ENV (rather than ARG) persists into the running container, so apt-get stays
+# non-interactive both during the build and at runtime.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Base tools (kept alphabetical for easy diffing). The apt lists are removed in
+# the same layer so they never reach the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    git \
+    golang-go \
+    python3 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# Clone local checkout repos (baseline config: agent has local access to these)
+RUN git clone --depth 1 https://github.com/sg-evals/tidb--v8.5.0 /workspace/tidb--v8.5.0
+
+# Initialize git identity for agent commits.
+# --system writes /etc/gitconfig, so the identity and the safe.directory
+# exemption apply to every user in the image (root and the claude user created
+# below). --global here would only write root's ~/.gitconfig.
+RUN git config --system user.email "agent@example.com" && \
+    git config --system user.name "Agent" && \
+    git config --system safe.directory '*'
+
+# Create log directories
+RUN mkdir -p /logs/agent /logs/verifier
+
+# Pre-create claude user and set ownership at build time so Harbor's
+# runtime chown is a no-op (avoids 15-30 min delay on large repos).
+# Best-effort: an already-existing user, a missing directory or a failed chown
+# must not fail the build, hence the `|| true` guards.
+RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
+    for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true
+
+# Mark artifact-only mode — verifier parses answer.json
+RUN touch /tmp/.artifact_only_mode
+
+# Clear any inherited entrypoint so the container runs exactly the command it is given.
+ENTRYPOINT []
@@ -0,0 +1,35 @@
+# ccx-compliance-286 — artifact_only variant
+# The image ships without a repository checkout: code is reached through
+# Sourcegraph MCP only. The agent writes an answer.json artifact and the
+# verifier scores that artifact.
+
+FROM ubuntu:22.04
+
+# Runtime environment: non-interactive apt plus the mirror(s) this task targets.
+ENV DEBIAN_FRONTEND=noninteractive \
+    SOURCEGRAPH_REPOS="sg-evals/tidb--v8.5.0"
+
+# Minimal toolset; apt lists are dropped in the same layer.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    git \
+    python3 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# Start from an empty git repository; the identity is stored in the
+# repository's own config, the safe.directory exemption in the global config.
+RUN git config --global safe.directory '*' && \
+    git init && \
+    git config user.name "Agent" && \
+    git config user.email "agent@example.com"
+
+# Log directories for the agent and the verifier
+RUN mkdir -p /logs/agent /logs/verifier
+
+# Flag file checked by verifiers and eval scripts to detect artifact-only mode
+RUN touch /tmp/.artifact_only_mode
+
+# Create the claude user up front and give it whichever of the standard
+# directories exist. Every step is best-effort, so the build never fails here.
+RUN adduser --disabled-password --gecos '' claude 2>/dev/null || true; \
+    for d in /workspace /app /testbed /logs; do \
+        if [ -d "$d" ]; then chown -R claude:claude "$d" || true; fi; \
+    done
+
+ENTRYPOINT []
@@ -0,0 +1,36 @@
+# ccx-compliance-286 — sg_only_env variant (v2: clone-at-verify)
+# Empty workspace — agent uses Sourcegraph MCP for code access.
+# Verifier clones mirror(s) at verification time via clone manifest.
+
+FROM ubuntu:22.04
+
+# ENV (rather than ARG) persists into the running container, so apt-get stays
+# non-interactive both during the build and at runtime.
+ENV DEBIAN_FRONTEND=noninteractive
+ENV SOURCEGRAPH_REPOS="sg-evals/tidb--v8.5.0"
+
+# Base tools (kept alphabetical for easy diffing). The apt lists are removed in
+# the same layer so they never reach the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    git \
+    python3 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# Empty git repo so agent can commit work.
+# /workspace is chowned to the claude user further down, so git run by a
+# different user (e.g. root) would reject the repository as dubiously owned.
+# The safe.directory exemption prevents that and matches the artifact_only
+# variant of this environment.
+RUN git init && \
+    git config user.email "agent@example.com" && \
+    git config user.name "Agent" && \
+    git config --global safe.directory '*'
+
+# Create log directories
+RUN mkdir -p /logs/agent /logs/verifier
+
+# Clone manifest for verifier (clone-at-verify strategy): names the mirror to
+# clone and the directory, relative to workdir, to place it in.
+RUN echo '{"workdir":"/workspace","repos":[{"mirror":"sg-evals/tidb--v8.5.0","target_dir":"tidb--v8.5.0"}]}' > /tmp/.sg_only_clone_manifest.json
+
+# Mark sg_only mode
+RUN touch /tmp/.sg_only_mode
+
+# Pre-create claude user and set ownership at build time.
+# Best-effort: an already-existing user, a missing directory or a failed chown
+# must not fail the build, hence the `|| true` guards.
+RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
+    for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true
+
+# Clear any inherited entrypoint so the container runs exactly the command it is given.
+ENTRYPOINT []
@@ -0,0 +1,39 @@
+# TiDB SQL Privilege Check and RBAC Enforcement Audit
+
+## Your Task
+
+Audit the privilege and RBAC enforcement in pingcap/tidb. Find all Go source files under pkg/privilege/ and pkg/planner/ that implement privilege checking for SQL statements. Identify: the PrivilegeManager interface, the MySQLPrivilege cache, the RequestVerification method and its callers, the role graph resolution logic, and any files that map SQL statement types to required privileges. For each file report the path and its role in the privilege check flow.
+
+## Context
+
+You are working on a codebase task involving repos from the compliance domain.
+
+## Available Resources
+
+The local `/workspace/` directory contains a checkout of `sg-evals/tidb--v8.5.0` at `/workspace/tidb--v8.5.0`.
+
+## Output Format
+
+Create a file at `/workspace/answer.json` with your findings in the following structure:
+
+```json
+{
+  "files": [
+    {"repo": "org/repo-name", "path": "relative/path/to/file.go"}
+  ],
+  "symbols": [
+    {"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "SymbolName"}
+  ],
+  "chain": [
+    {"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "FunctionName"}
+  ],
+  "text": "Narrative explanation of your findings, citing repos and file paths."
+}
+```
+
+Include only the fields relevant to this task. Your answer is evaluated against a closed-world oracle — completeness matters.
+
+## Evaluation
+
+Your answer will be scored on:
+- **File recall and precision**: Did you find all relevant files?
@@ -0,0 +1,111 @@
+# IMPORTANT: Source Code Access
+
+**Local source files are not present.** Your workspace does not contain source code. You **MUST** use Sourcegraph MCP tools to discover, read, and understand code before making any changes.
+
+**Target Repositories (version-pinned mirrors):**
+
+- `github.com/sg-evals/tidb--v8.5.0` — use `repo:^github.com/sg-evals/tidb--v8.5.0$` filter
+
+Scope ALL keyword_search/nls_search queries to these repos.
+Use the repo name as the `repo` parameter for read_file/go_to_definition/find_references.
+
+
+## Required Workflow
+
+1. **Search first** — Use MCP tools to find relevant files and understand existing patterns
+2. **Read remotely** — Use `sg_read_file` to read full file contents from Sourcegraph
+3. **Edit locally** — Use Edit, Write, and Bash to create or modify files in your working directory
+4. **Verify locally** — Run tests with Bash to check your changes
+
+## Tool Selection
+
+| Goal | Tool |
+|------|------|
+| Exact symbol/string | `sg_keyword_search` |
+| Concepts/semantic search | `sg_nls_search` |
+| Trace usage/callers | `sg_find_references` |
+| See implementation | `sg_go_to_definition` |
+| Read full file | `sg_read_file` |
+| Browse structure | `sg_list_files` |
+| Find repos | `sg_list_repos` |
+| Search commits | `sg_commit_search` |
+| Track changes | `sg_diff_search` |
+| Compare versions | `sg_compare_revisions` |
+
+**Decision logic:**
+1. Know the exact symbol? → `sg_keyword_search`
+2. Know the concept, not the name? → `sg_nls_search`
+3. Need definition of a symbol? → `sg_go_to_definition`
+4. Need all callers/references? → `sg_find_references`
+5. Need full file content? → `sg_read_file`
+
+## Scoping (Always Do This)
+
+```
+repo:^github.com/ORG/REPO$           # Exact repo (preferred)
+repo:github.com/ORG/                 # All repos in org
+file:.*\.ts$                         # TypeScript only
+file:src/api/                        # Specific directory
+```
+
+Start narrow. Expand only if results are empty.
+
+## Efficiency Rules
+
+- Chain searches logically: search → read → references → definition
+- Don't re-search for the same pattern; use results from prior calls
+- Prefer `sg_keyword_search` over `sg_nls_search` when you have exact terms
+- Read 2-3 related files before synthesising, rather than one at a time
+- Don't read 20+ remote files without writing code — once you understand the pattern, start implementing
+
+## If Stuck
+
+If MCP search returns no results:
+1. Broaden the search query (synonyms, partial identifiers)
+2. Try `sg_nls_search` for semantic matching
+3. Use `sg_list_files` to browse the directory structure
+4. Use `sg_list_repos` to verify the repository name
+
+---
+
+**Sourcegraph Repositories:** `github.com/sg-evals/tidb--v8.5.0`
+
+# TiDB SQL Privilege Check and RBAC Enforcement Audit
+
+## Your Task
+
+Audit the privilege and RBAC enforcement in pingcap/tidb. Find all Go source files under pkg/privilege/ and pkg/planner/ that implement privilege checking for SQL statements. Identify: the PrivilegeManager interface, the MySQLPrivilege cache, the RequestVerification method and its callers, the role graph resolution logic, and any files that map SQL statement types to required privileges. For each file report the path and its role in the privilege check flow.
+
+## Context
+
+You are working on a codebase task involving repos from the compliance domain.
+
+## Available Resources
+
+The repository `sg-evals/tidb--v8.5.0` is available through the Sourcegraph MCP tools only; it is not checked out in the local `/workspace/` directory.
+
+## Output Format
+
+Create a file at `/workspace/answer.json` with your findings in the following structure:
+
+```json
+{
+  "files": [
+    {"repo": "org/repo-name", "path": "relative/path/to/file.go"}
+  ],
+  "symbols": [
+    {"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "SymbolName"}
+  ],
+  "chain": [
+    {"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "FunctionName"}
+  ],
+  "text": "Narrative explanation of your findings, citing repos and file paths."
+}
+```
+
+Include only the fields relevant to this task. Your answer is evaluated against a closed-world oracle — completeness matters.
+
+## Evaluation
+
+Your answer will be scored on:
+- **File recall and precision**: Did you find all relevant files?
@@ -0,0 +1,29 @@
+# Task specification for benchmark task CCX-compliance-286.
+version = "1.0"
+
+# Human-readable task metadata.
+[metadata]
+name = "CCX-compliance-286"
+description = "TiDB SQL Privilege Check and RBAC Enforcement Audit"
+license = "Apache-2.0"
+
+# Task identity, target repository and classification.
+[task]
+id = "CCX-compliance-286"
+# Version-pinned mirror; the same name appears in the environment Dockerfiles.
+repo = "sg-evals/tidb--v8.5.0"
+category = "compliance-audit"
+language = "go"
+difficulty = "hard"
+# presumably the agent's time budget in seconds — confirm against the harness
+time_limit_sec = 900
+mcp_suite = "csb_org_compliance"
+use_case_id = 286
+repo_set_id = "tidb-database"
+mcp_unique = true
+verification_modes = ["artifact"]
+
+# How the result is verified and what kind of reward is produced.
+[verification]
+type = "test"
+# NOTE(review): the evaluator shipped alongside this file is tests/eval.sh;
+# confirm that /tests/test.sh exists (or delegates to eval.sh).
+command = "bash /tests/test.sh"
+
+# Still part of [verification]: the blank line above does not end the table.
+reward_type = "score"
+description = "TiDB SQL Privilege Check and RBAC Enforcement Audit"
+
+# Image build settings.
+[environment]
+build_timeout_sec = 600.0
@@ -0,0 +1,68 @@
+#!/bin/bash
+# eval.sh — MCP-unique benchmark evaluator for CCX-compliance-286
+# Exit-code-first (SWE-Factory pattern):
+#   exit 0 — agent produced useful output (composite score > 0)
+#   exit 1 — total failure (composite score == 0, missing or invalid
+#            answer.json, missing oracle_checks.py, or an unusable score)
+#
+# Writes /logs/verifier/reward.txt with the composite score [0.0, 1.0]
+
+set -euo pipefail
+
+TASK_ID="CCX-compliance-286"
+ANSWER_PATH="/workspace/answer.json"
+TASK_SPEC_PATH="/tests/task_spec.json"
+ORACLE_CHECKS="/tests/oracle_checks.py"
+REWARD_PATH="/logs/verifier/reward.txt"
+
+mkdir -p /logs/verifier
+
+echo "=== $TASK_ID evaluator ==="
+echo "Task spec: $TASK_SPEC_PATH"
+echo "Answer:    $ANSWER_PATH"
+echo ""
+
+# sg_only mode guard: restore full repo if verifier wrapper exists
+if [ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ]; then
+    echo "sg_only mode: sourcing verifier wrapper..."
+    source /tests/sgonly_verifier_wrapper.sh
+fi
+
+# Verify answer file exists
+if [ ! -f "$ANSWER_PATH" ]; then
+    echo "ERROR: answer.json not found at $ANSWER_PATH"
+    echo "0.0" > "$REWARD_PATH"
+    exit 1
+fi
+
+# Validate answer is valid JSON.
+# The path is passed as an argument instead of being spliced into the Python
+# source, so quoting in the path can never alter the code that runs.
+if ! python3 -c 'import json, sys; json.load(open(sys.argv[1]))' "$ANSWER_PATH" 2>/dev/null; then
+    echo "ERROR: answer.json is not valid JSON"
+    echo "0.0" > "$REWARD_PATH"
+    exit 1
+fi
+
+echo "answer.json found and valid JSON"
+
+# Run oracle checks
+if [ ! -f "$ORACLE_CHECKS" ]; then
+    echo "ERROR: oracle_checks.py not found at $ORACLE_CHECKS"
+    echo "0.0" > "$REWARD_PATH"
+    exit 1
+fi
+
+echo "Running oracle checks..."
+# The oracle's combined output is echoed to stderr for the logs; its last line
+# is taken as the score. `|| true` keeps a non-zero oracle exit from aborting
+# the script under `set -e` — the score check below decides the outcome.
+SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
+
+# Validate score is a finite number inside the documented [0.0, 1.0] range.
+# float() alone also accepts "nan" and "inf", which must never be written to
+# the reward file.
+if ! python3 -c 'import math, sys; v = float(sys.argv[1]); sys.exit(0 if math.isfinite(v) and 0.0 <= v <= 1.0 else 1)' "$SCORE" 2>/dev/null; then
+    echo "ERROR: oracle_checks.py did not return a valid score: $SCORE"
+    echo "0.0" > "$REWARD_PATH"
+    exit 1
+fi
+
+echo ""
+echo "Composite score: $SCORE"
+echo "$SCORE" > "$REWARD_PATH"
+
+# Exit based on score (SWE-Factory exit-code-first pattern)
+python3 -c 'import sys; sys.exit(0 if float(sys.argv[1]) > 0 else 1)' "$SCORE"