sourcegraph
diff --git a/‎agents/claude_baseline_agent.py‎
Lines changed: 44 additions & 37 deletions b/‎agents/claude_baseline_agent.py‎
Lines changed: 44 additions & 37 deletions
diff --git a/‎benchmarks/csb/crossrepo/ccx-config-trace-010/environment/Dockerfile‎
Lines changed: 38 additions & 0 deletions b/‎benchmarks/csb/crossrepo/ccx-config-trace-010/environment/Dockerfile‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎benchmarks/csb/crossrepo/ccx-config-trace-010/environment/Dockerfile.artifact_baseline‎
Lines changed: 44 additions & 0 deletions b/‎benchmarks/csb/crossrepo/ccx-config-trace-010/environment/Dockerfile.artifact_baseline‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎benchmarks/csb/crossrepo/ccx-config-trace-010/environment/Dockerfile.artifact_only‎
Lines changed: 35 additions & 0 deletions b/‎benchmarks/csb/crossrepo/ccx-config-trace-010/environment/Dockerfile.artifact_only‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎benchmarks/csb/crossrepo/ccx-config-trace-010/environment/Dockerfile.sg_only‎
Lines changed: 36 additions & 0 deletions b/‎benchmarks/csb/crossrepo/ccx-config-trace-010/environment/Dockerfile.sg_only‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎benchmarks/csb/crossrepo/ccx-config-trace-010/instruction.md‎
Lines changed: 46 additions & 0 deletions b/‎benchmarks/csb/crossrepo/ccx-config-trace-010/instruction.md‎
Lines changed: 46 additions & 0 deletions
@@ -77,7 +77,24 @@
 - ✅ `cargo check -p crate 2>&1 | head -100` — `head` returns as soon as it has enough output
 - ✅ Run long builds in the background and check output later
 
-**If a command takes more than 60 seconds, it is probably too broad.** Kill it, narrow the scope, and retry."""
+**If a command takes more than 60 seconds, it is probably too broad.** Kill it, narrow the scope, and retry.
+
+## OUTPUT ARTIFACT
+
+After completing your work, also write `/workspace/answer.json` summarizing what you did:
+```json
+{
+  "analysis": {
+    "summary": "Brief description of your approach",
+    "files_examined": [{"path": "file.ext", "description": "why you looked at it"}],
+    "reasoning": "Detailed explanation or analysis"
+  },
+  "changes": [
+    {"file": "path.ext", "description": "what you changed", "diff": "unified diff"}
+  ]
+}
+```
+Include `changes` with unified diffs for every file you modified. For analysis-only tasks, omit `changes` and focus on `analysis`. This artifact is scored independently from your direct edits."""
 
 
 # Concise Sourcegraph MCP tool reference for CLAUDE.md injection.
@@ -106,7 +123,7 @@
 # No truncation language — local source files simply aren't present.
 # 25% shorter than V4: removes Workflows, Output Formatting, Common Mistakes, Query Patterns.
 # {repo_scope} is replaced at runtime with the target repository filter.
-# {workflow_tail} is replaced with edit+test steps (direct) or produce-artifact step (artifact_full).
+# {workflow_tail} is replaced with the edit, test, and answer.json steps (the agent always does both direct edits and the artifact).
 V5_PREAMBLE_TEMPLATE = """# IMPORTANT: Source Code Access
 
 **Local source files are not present.** Your workspace does not contain source code. You **MUST** use Sourcegraph MCP tools to discover, read, and understand code before making any changes.
@@ -643,35 +660,30 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
                 )
                 repo_scope += branch_instructions
 
-            # Workflow steps 3-4 vary by config: direct configs edit+test
-            # locally, artifact configs produce diffs as output artifacts.
-            if mcp_type == "artifact_full":
-                workflow_tail = (
-                    "3. **Produce answer.json** — Write ALL output to "
-                    "`/workspace/answer.json` with this structure:\n"
-                    "   ```json\n"
-                    "   {\n"
-                    '     "analysis": {\n'
-                    '       "summary": "Brief description of your approach",\n'
-                    '       "files_examined": [{"path": "file.ext", "description": "..."}],\n'
-                    '       "reasoning": "Detailed explanation or analysis"\n'
-                    "     },\n"
-                    '     "changes": [\n'
-                    '       {"file": "path.ext", "description": "...", "diff": "unified diff"}\n'
-                    "     ]\n"
-                    "   }\n"
-                    "   ```\n"
-                    "   Omit `changes` if the task is analysis-only. "
-                    "Do NOT edit source files directly — produce diffs in "
-                    "`changes[]` instead."
-                )
-            else:
-                workflow_tail = (
-                    "3. **Edit locally** — Use Edit, Write, and Bash to "
-                    "create or modify files in your working directory\n"
-                    "4. **Verify locally** — Run tests with Bash to check "
-                    "your changes"
-                )
+            # Workflow steps 3-5: agent always does BOTH direct edits AND
+            # produces answer.json.  The verifier scores both independently.
+            workflow_tail = (
+                "3. **Edit locally** — Use Edit, Write, and Bash to "
+                "create or modify files in your working directory\n"
+                "4. **Verify locally** — Run tests with Bash to check "
+                "your changes\n"
+                "5. **Produce answer.json** — After completing your edits, "
+                "also write `/workspace/answer.json` summarizing your work:\n"
+                "   ```json\n"
+                "   {\n"
+                '     "analysis": {\n'
+                '       "summary": "Brief description of your approach",\n'
+                '       "files_examined": [{"path": "file.ext", "description": "..."}],\n'
+                '       "reasoning": "Detailed explanation or analysis"\n'
+                "     },\n"
+                '     "changes": [\n'
+                '       {"file": "path.ext", "description": "...", "diff": "unified diff"}\n'
+                "     ]\n"
+                "   }\n"
+                "   ```\n"
+                "   Include `changes` with unified diffs for every file you modified. "
+                "For analysis-only tasks, omit `changes` and focus on `analysis`."
+            )
 
             mcp_preamble = V5_PREAMBLE_TEMPLATE.format(
                 repo_scope=repo_scope, workflow_tail=workflow_tail
@@ -830,12 +842,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
             else:
                 repo_filter_system = "Use list_repos to discover available repositories first."
 
-            if mcp_type == "artifact_full":
-                mcp_system_prompt = f"""IMPORTANT: Local source files are not present. You MUST use Sourcegraph MCP tools to discover and read code. Write ALL output to /workspace/answer.json with "analysis" (summary, files_examined, reasoning) and optional "changes" (file, description, diff) arrays. Do NOT edit source files directly.
-
-{repo_filter_system}"""
-            else:
-                mcp_system_prompt = f"""IMPORTANT: Local source files are not present. You MUST use Sourcegraph MCP tools to discover and read code, then create or edit local files based on what you learn. Run tests locally to verify your changes.
+            mcp_system_prompt = f"""IMPORTANT: Local source files are not present. You MUST use Sourcegraph MCP tools to discover and read code, then create or edit local files based on what you learn. Run tests locally to verify your changes. After completing edits, also write /workspace/answer.json with "analysis" (summary, files_examined, reasoning) and "changes" (file, description, diff) arrays.
 
 {repo_filter_system}"""
             system_prompt_append = EVALUATION_CONTEXT_PROMPT + "\n\n---\n\n" + mcp_system_prompt
 
@@ -0,0 +1,38 @@
+FROM ubuntu:22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Base tools
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    ca-certificates \
+    curl \
+    python3 \
+    golang-go \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create claude user BEFORE cloning so files are owned correctly from the
+# start.  This avoids a post-clone chown -R layer that doubles image size
+# and takes 15-30 min on overlay2 (copy-on-write duplicates every inode).
+RUN adduser --disabled-password --gecos '' claude 2>/dev/null || true
+RUN mkdir -p /workspace /logs/agent /logs/verifier && \
+    chown -R claude:claude /workspace /logs
+
+# Clone as claude — files land claude-owned, no separate chown layer needed.
+USER claude
+WORKDIR /workspace
+# Clone local checkout repos (baseline config: agent has local access to these)
+RUN git clone --depth 1 https://github.com/sg-evals/kubernetes--v1.32.0 /workspace/kubernetes--v1.32.0
+RUN git clone --depth 1 https://github.com/sg-evals/client-go--v0.32.0 /workspace/client-go--v0.32.0
+RUN git clone --depth 1 https://github.com/sg-evals/api--v0.32.0 /workspace/api--v0.32.0
+RUN git clone --depth 1 https://github.com/sg-evals/etcd-io-etcd /workspace/etcd-io-etcd
+
+# Initialize git identity for agent commits
+RUN git config --global user.email "agent@example.com" && \
+    git config --global user.name "Agent" && \
+    git config --global safe.directory '*'
+
+# Switch back to root for Harbor's runtime setup
+USER root
+
+ENTRYPOINT []
@@ -0,0 +1,44 @@
+# ccx-config-trace-010 — artifact_baseline variant
+# Baseline with local code + artifact mode (verifier parses answer.json).
+
+FROM ubuntu:22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Base tools
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    ca-certificates \
+    curl \
+    python3 \
+    golang-go \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create claude user BEFORE cloning so files are owned correctly from the
+# start.  This avoids a post-clone chown -R layer that doubles image size
+# and takes 15-30 min on overlay2 (copy-on-write duplicates every inode).
+RUN adduser --disabled-password --gecos '' claude 2>/dev/null || true
+RUN mkdir -p /workspace /logs/agent /logs/verifier && \
+    chown -R claude:claude /workspace /logs
+
+# Clone as claude — files land claude-owned, no separate chown layer needed.
+USER claude
+WORKDIR /workspace
+# Clone local checkout repos (baseline config: agent has local access to these)
+RUN git clone --depth 1 https://github.com/sg-evals/kubernetes--v1.32.0 /workspace/kubernetes--v1.32.0
+RUN git clone --depth 1 https://github.com/sg-evals/client-go--v0.32.0 /workspace/client-go--v0.32.0
+RUN git clone --depth 1 https://github.com/sg-evals/api--v0.32.0 /workspace/api--v0.32.0
+RUN git clone --depth 1 https://github.com/sg-evals/etcd-io-etcd /workspace/etcd-io-etcd
+
+# Initialize git identity for agent commits
+RUN git config --global user.email "agent@example.com" && \
+    git config --global user.name "Agent" && \
+    git config --global safe.directory '*'
+
+# Switch back to root for Harbor's runtime setup
+USER root
+
+# Set the artifact-mode flag (same /tmp/.artifact_only_mode file the artifact_only variant uses) — verifier parses answer.json
+RUN touch /tmp/.artifact_only_mode
+
+ENTRYPOINT []
@@ -0,0 +1,35 @@
+# ccx-config-trace-010 — artifact_only variant
+# No local repo clone — agent uses Sourcegraph MCP exclusively for code access.
+# Agent produces answer.json artifact; verifier scores the artifact.
+
+FROM ubuntu:22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV SOURCEGRAPH_REPOS="sg-evals/api--v0.32.0,sg-evals/client-go--v0.32.0,sg-evals/etcd-io-etcd,sg-evals/kubernetes--v1.32.0"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    ca-certificates \
+    python3 \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# Empty workspace — agent discovers code via MCP tools only
+RUN git init && \
+    git config user.email "agent@example.com" && \
+    git config user.name "Agent" && \
+    git config --global safe.directory '*'
+
+# Create log directories
+RUN mkdir -p /logs/agent /logs/verifier
+
+# Mark artifact-only mode — verifiers and eval scripts check this flag
+RUN touch /tmp/.artifact_only_mode
+
+# Pre-create claude user and set ownership at build time.
+RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
+    for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true
+
+ENTRYPOINT []
@@ -0,0 +1,36 @@
+# ccx-config-trace-010 — sg_only_env variant (v2: clone-at-verify)
+# Empty workspace — agent uses Sourcegraph MCP for code access.
+# Verifier clones mirror(s) at verification time via clone manifest.
+
+FROM ubuntu:22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV SOURCEGRAPH_REPOS="sg-evals/api--v0.32.0,sg-evals/client-go--v0.32.0,sg-evals/etcd-io-etcd,sg-evals/kubernetes--v1.32.0"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    ca-certificates \
+    python3 \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# Empty git repo so agent can commit work; safe.directory (as in the artifact_only variant) keeps root-run git working after /workspace is chowned to claude
+RUN git init && \
+    git config user.email "agent@example.com" && \
+    git config user.name "Agent" && git config --global safe.directory '*'
+
+RUN mkdir -p /logs/agent /logs/verifier
+
+# Clone manifest for verifier (clone-at-verify strategy)
+RUN echo '{"workdir":"/workspace","repos":[{"mirror":"sg-evals/kubernetes--v1.32.0","target_dir":"kubernetes--v1.32.0"},{"mirror":"sg-evals/client-go--v0.32.0","target_dir":"client-go--v0.32.0"},{"mirror":"sg-evals/api--v0.32.0","target_dir":"api--v0.32.0"},{"mirror":"sg-evals/etcd-io-etcd","target_dir":"etcd-io-etcd"}]}' > /tmp/.sg_only_clone_manifest.json
+
+# Mark sg_only mode
+RUN touch /tmp/.sg_only_mode
+
+# Pre-create claude user and set ownership at build time.
+RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
+    for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true
+
+ENTRYPOINT []
@@ -0,0 +1,46 @@
+# Stack Trace Symbol Resolution: rest.Config
+
+## Your Task
+
+Find the repository and file path where the `Config` struct of the `rest` package in `k8s.io/client-go` is defined (the canonical definition, not a vendored copy). Also report the exact Go import path of that package.
+
+## Context
+
+You are working on a codebase task involving repos from the crossrepo tracing domain.
+
+## Available Resources
+
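+The local `/workspace/` directory contains checkouts of sg-evals/kubernetes--v1.32.0, sg-evals/client-go--v0.32.0, sg-evals/api--v0.32.0, and sg-evals/etcd-io-etcd, each in a subdirectory named after the repo (e.g. `/workspace/kubernetes--v1.32.0`).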
+
+
+## Output Format
+
+Use the published task contract:
+
+- `TASK_WORKDIR=/workspace`
+- `TASK_REPO_ROOT=/workspace`
+- `TASK_OUTPUT=/workspace/answer.json`
+
+Create a file at `TASK_OUTPUT` (`/workspace/answer.json`) with your findings in the following structure:
+
+```json
+{
+  "files": [
+    {"repo": "repo-name", "path": "relative/path/to/file.go"}
+  ],
+  "symbols": [
+    {"repo": "repo-name", "path": "relative/path/to/file.go", "symbol": "SymbolName"}
+  ],
+  "chain": [
+    {"repo": "repo-name", "path": "relative/path/to/file.go", "symbol": "FunctionName"}
+  ],
+  "text": "Narrative explanation of your findings, citing repos and file paths."
+}
+```
+
+Include only the fields relevant to this task. Your answer is evaluated against a closed-world oracle — completeness matters.
+
+## Evaluation
+
+Your answer will be scored on:
+- **File recall and precision**: Did you find all relevant files?