Skip to content

Commit fa591a4

Browse files
sjarmakclaude
and committed
feat: artifact-only mode with delete-not-truncate for MCP config
- Agent wrapper deletes source files (-delete) instead of truncating to empty (echo ""), preventing agents from wasting tokens reading visible-but-empty files in artifact_full mode - Both baseline and MCP-full use Dockerfile.artifact_only: baseline keeps source readable, MCP-full deletes source at runtime - sdlc_suite_2config.sh gains baseline+artifact_full Dockerfile swap case (same temp-dir pattern as sourcegraph_full) - Dockerfile.artifact_only keeps source in /workspace (no rm -rf) so baseline agent can read code locally - Add smoke_artifact_verifier.py for CI-style verification of artifact verifier pipeline (build + no-artifact + detection-only tests) - Paired run results: baseline=1.00, artifact_full=0.46 (detection OK, patches need iteration) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent aa76de2 commit fa591a4

File tree

4 files changed

+372
-3
lines changed

4 files changed

+372
-3
lines changed

agents/claude_baseline_agent.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -847,6 +847,23 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
847847
'cd "$WORKDIR"',
848848
]
849849

850+
# For artifact_full: delete source files so agent must use MCP.
851+
# Baseline keeps source readable (mcp_type == "none").
852+
# We delete rather than truncate to avoid agents wasting tokens
853+
# trying to read visible-but-empty files.
854+
if mcp_type == "artifact_full":
855+
script_lines.extend([
856+
'# Delete source files — agent must use MCP to read code',
857+
'find /workspace -type f \\( '
858+
'-name "*.cs" -o -name "*.py" -o -name "*.ts" -o -name "*.tsx" '
859+
'-o -name "*.js" -o -name "*.jsx" -o -name "*.go" -o -name "*.rs" '
860+
'-o -name "*.java" -o -name "*.c" -o -name "*.h" -o -name "*.cpp" '
861+
'-o -name "*.rb" -o -name "*.php" -o -name "*.swift" -o -name "*.kt" '
862+
'-o -name "*.scala" -o -name "*.vue" -o -name "*.svelte" '
863+
'\\) -delete',
864+
'echo "Source files deleted for artifact_full mode"',
865+
])
866+
850867
# If system prompt exists, read it from file and pass via --append-system-prompt
851868
if _system_prompt_content:
852869
script_lines.append('SYSPROMPT=$(cat /tmp/claude_system_prompt.txt)')

benchmarks/ccb_test/aspnetcore-code-review-001/environment/Dockerfile.artifact_only

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,10 @@ RUN mkdir -p /workspace/tests /logs/verifier
3636

3737
WORKDIR /workspace
3838

39-
# --- artifact_only: backup full repo, then clear workspace for agent ---
39+
# --- artifact_only: backup full repo for verifier scoring ---
40+
# Source stays in /workspace (readable by baseline agent).
41+
# MCP-full agent deletes source files at runtime via agent startup script.
4042
RUN cp -a /workspace /repo_full
41-
RUN rm -rf /workspace && mkdir -p /workspace
4243
RUN touch /tmp/.artifact_only_mode && echo '/workspace' > /tmp/.artifact_only_workdir
4344

4445
WORKDIR /workspace

configs/sdlc_suite_2config.sh

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,9 +215,22 @@ _sdlc_run_single() {
215215
return 1
216216
fi
217217

218+
# For artifact configs, BOTH baseline and full use Dockerfile.artifact_only
219+
# (baseline keeps source readable; MCP-full deletes source files at runtime).
220+
if [ "$config" = "baseline" ] && [ "${FULL_CONFIG}" = "artifact_full" ]; then
221+
local artifact="${task_path}/environment/Dockerfile.artifact_only"
222+
if [ -f "$artifact" ]; then
223+
temp_task_dir="/tmp/artifact_bl_${task_id}"
224+
rm -rf "$temp_task_dir"
225+
mkdir -p "$temp_task_dir"
226+
cp -a "${task_path}/." "${temp_task_dir}/"
227+
cp "${temp_task_dir}/environment/Dockerfile.artifact_only" "${temp_task_dir}/environment/Dockerfile"
228+
run_task_path="$temp_task_dir"
229+
fi
230+
218231
# For MCP-full runs, enforce sg_only Docker build env without mutating the
219232
# original task path (baseline may run in parallel on the same task).
220-
if [ "$config" = "sourcegraph_full" ]; then
233+
elif [ "$config" = "sourcegraph_full" ]; then
221234
local sgonly="${task_path}/environment/Dockerfile.sg_only"
222235
if [ ! -f "$sgonly" ]; then
223236
echo "ERROR: Missing Dockerfile.sg_only for $task_id at $sgonly"

scripts/smoke_artifact_verifier.py

Lines changed: 338 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,338 @@
1+
#!/usr/bin/env python3
2+
"""Smoke-test artifact-only verifiers without running an agent.
3+
4+
Builds Dockerfile.artifact_only, runs the verifier with mock artifacts,
5+
and checks that reward.txt is produced with expected scores.
6+
7+
Usage:
8+
python3 scripts/smoke_artifact_verifier.py --task benchmarks/ccb_test/aspnetcore-code-review-001
9+
python3 scripts/smoke_artifact_verifier.py --all # all tasks with Dockerfile.artifact_only
10+
python3 scripts/smoke_artifact_verifier.py --suite ccb_test
11+
"""
12+
13+
import argparse
14+
import json
15+
import os
16+
import shutil
17+
import subprocess
18+
import sys
19+
import tempfile
20+
import uuid
21+
from pathlib import Path
22+
23+
ROOT = Path(__file__).resolve().parent.parent
24+
25+
26+
def find_artifact_tasks(suite: str | None = None) -> list[Path]:
    """Return task directories that ship a Dockerfile.artifact_only.

    Scans ``benchmarks/<suite>/*`` under the repo root — or every ``ccb_*``
    suite when *suite* is None — in sorted order.
    """
    pattern = f"benchmarks/{suite or 'ccb_*'}/*"
    return [
        candidate
        for candidate in sorted(ROOT.glob(pattern))
        if candidate.is_dir()
        and (candidate / "environment" / "Dockerfile.artifact_only").exists()
    ]
37+
38+
39+
def build_image(task_dir: Path, tag: str, timeout: int = 300) -> tuple[bool, str]:
    """Build the task's Dockerfile.artifact_only image.

    Args:
        task_dir: Task directory containing an ``environment/`` subfolder.
        tag: Docker image tag to assign.
        timeout: Seconds before the build is aborted.

    Returns:
        ``(success, message)`` — message carries the failure reason on error.
    """
    env_dir = task_dir / "environment"
    dockerfile = env_dir / "Dockerfile.artifact_only"

    # Bail out early when the task has no artifact-only Dockerfile at all.
    if not dockerfile.exists():
        return False, f"No Dockerfile.artifact_only in {env_dir}"

    build_cmd = [
        "docker", "build",
        "-f", str(dockerfile),
        "-t", tag,
        str(env_dir),
    ]

    try:
        proc = subprocess.run(
            build_cmd, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return False, f"Docker build timed out ({timeout}s)"

    if proc.returncode != 0:
        # Tail of stderr is usually enough to diagnose a failed build.
        return False, f"Docker build failed:\n{proc.stderr[-500:]}"
    return True, "Build OK"
63+
64+
65+
def run_verifier(
    tag: str,
    tests_dir: Path,
    mock_files: dict[str, str] | None = None,
    timeout: int = 120,
) -> tuple[bool, float | None, str]:
    """Run the task verifier inside the built container.

    Args:
        tag: Docker image tag to run.
        tests_dir: Path to the task's tests/ directory (mounted read-only).
        mock_files: Optional ``{container_path: content}`` files to create
            inside the container before the verifier runs.
        timeout: Seconds before the container run is killed.

    Returns:
        ``(success, reward, details)`` — reward is None when no reward file
        was produced or it could not be parsed; details carries diagnostics
        or the verifier's scoring log.
    """
    with tempfile.TemporaryDirectory(prefix="smoke_verifier_") as tmpdir:
        logs_dir = Path(tmpdir) / "logs"
        logs_dir.mkdir()
        (logs_dir / "verifier").mkdir()
        (logs_dir / "agent").mkdir()

        # Build mock-file creation commands. Write via printf with
        # single-quote escaping rather than a heredoc: a heredoc breaks if
        # the content itself ever contains the terminator line, and the
        # previous version computed the escaped form but never used it.
        mock_cmds = []
        if mock_files:
            for path, content in mock_files.items():
                # ' -> '"'"' so arbitrary content survives single-quoting.
                escaped = content.replace("'", "'\"'\"'")
                mock_cmds.append(f"mkdir -p \"$(dirname '{path}')\"")
                # printf '%s\n' matches the trailing newline a heredoc adds.
                mock_cmds.append(f"printf '%s\\n' '{escaped}' > '{path}'")

        mock_script = "\n".join(mock_cmds)

        run_script = f"""
set -e
mkdir -p /logs/verifier /logs/agent
{mock_script}
bash /tests/test.sh 2>/logs/verifier/test-stderr.txt
"""

        cmd = [
            "docker", "run", "--rm",
            "-v", f"{tests_dir}:/tests:ro",
            "-v", f"{logs_dir}:/logs",
            tag,
            "bash", "-c", run_script,
        ]

        try:
            result = subprocess.run(
                cmd, capture_output=True, text=True, timeout=timeout
            )
        except subprocess.TimeoutExpired:
            return False, None, f"Verifier timed out ({timeout}s)"

        stdout = result.stdout.strip()

        # The verifier may emit either reward.txt (bare float) or reward.json.
        reward_txt = logs_dir / "verifier" / "reward.txt"
        reward_json = logs_dir / "verifier" / "reward.json"

        reward = None
        if reward_txt.exists():
            try:
                reward = float(reward_txt.read_text().strip())
            except ValueError:
                return False, None, f"reward.txt not a float: {reward_txt.read_text()!r}"
        elif reward_json.exists():
            try:
                data = json.loads(reward_json.read_text())
                reward = float(data.get("reward", data.get("score", 0)))
            except (json.JSONDecodeError, ValueError) as e:
                return False, None, f"reward.json parse error: {e}"
        else:
            # No reward file at all: surface exit code plus captured output.
            stderr_file = logs_dir / "verifier" / "test-stderr.txt"
            stderr_content = ""
            if stderr_file.exists():
                stderr_content = stderr_file.read_text()[-500:]
            return False, None, (
                f"No reward file produced (exit={result.returncode})\n"
                f"stdout: {stdout[-300:]}\n"
                f"stderr: {stderr_content}"
            )

        # Scoring details land on stderr inside the container (redirected
        # to test-stderr.txt by run_script above).
        stderr_file = logs_dir / "verifier" / "test-stderr.txt"
        details = ""
        if stderr_file.exists():
            details = stderr_file.read_text()

        return True, reward, details
156+
157+
158+
def load_expected_defects(task_dir: Path) -> list[dict] | None:
159+
"""Load expected_defects.json if it exists."""
160+
path = task_dir / "tests" / "expected_defects.json"
161+
if path.exists():
162+
with open(path) as f:
163+
return json.load(f)
164+
return None
165+
166+
167+
def make_detection_review(defects: list[dict]) -> str:
    """Build a review.json payload flagging every expected defect file.

    Detection only: each entry names the file and a placeholder description;
    no fixes are proposed.
    """
    findings = [
        {
            "file": defect["file"],
            "severity": defect.get("severity", "medium"),
            "description": f"Smoke test detection for {defect['id']}",
        }
        for defect in defects
    ]
    return json.dumps(findings, indent=2)
177+
178+
179+
def smoke_test_task(
    task_dir: Path,
    build_timeout: int = 300,
    verify_timeout: int = 120,
    verbose: bool = False,
) -> dict:
    """Run the full smoke test for one task.

    Pipeline: build the artifact-only image, then run the verifier twice —
    once with no artifact (expects a ~0.0 score) and, when the task ships
    expected_defects.json, once with a detection-only review.json (expects
    a score somewhere in a deliberately wide mid-range).

    Returns dict with: task, passed, tests (list of individual test results).
    """
    task_name = task_dir.name
    # Unique tag per run so repeated/parallel invocations never collide.
    tag = f"ccb-smoke-artifact-{task_name}-{uuid.uuid4().hex[:8]}"
    tests_dir = task_dir / "tests"
    results = {"task": task_name, "passed": True, "tests": [], "tag": tag}

    # --- Test 0: Build ---
    ok, msg = build_image(task_dir, tag, timeout=build_timeout)
    results["tests"].append({
        "name": "docker_build",
        "passed": ok,
        "message": msg,
    })
    if not ok:
        # Without an image the remaining tests cannot run.
        results["passed"] = False
        return results

    try:
        # --- Test 1: No artifact → score 0.0 ---
        ok, reward, details = run_verifier(tag, tests_dir, timeout=verify_timeout)
        test1 = {
            "name": "no_artifact",
            "passed": ok and reward is not None and abs(reward) < 0.01,
            "reward": reward,
            "expected": 0.0,
            "message": "Verifier handles missing artifact gracefully",
        }
        if not ok:
            test1["message"] = f"FAILED: {details[:300]}"
            test1["passed"] = False
        elif reward is not None and abs(reward) > 0.01:
            test1["message"] = f"Expected ~0.0, got {reward}"
            test1["passed"] = False
        results["tests"].append(test1)
        if verbose and details:
            print(f" [no_artifact] scoring: {details[:200]}", file=sys.stderr)

        # --- Test 2: Detection-only review.json → ~0.50 ---
        # Skipped entirely when the task has no expected_defects.json.
        defects = load_expected_defects(task_dir)
        if defects:
            mock_review = make_detection_review(defects)
            mock_files = {"/workspace/review.json": mock_review}

            ok, reward, details = run_verifier(
                tag, tests_dir, mock_files=mock_files, timeout=verify_timeout
            )
            # Expected range: detection F1 should be ~1.0 (0.5 weight) and
            # some fix patterns may match base code. Range is intentionally
            # wide — the smoke test verifies the verifier RUNS, not scoring
            # calibration. Any score in [0.05, 0.95] shows the pipeline works.
            test2 = {
                "name": "detection_only",
                "passed": ok and reward is not None and 0.05 <= reward <= 0.95,
                "reward": reward,
                "expected_range": [0.05, 0.95],
                "message": "Detection-only review scores in expected range",
            }
            if not ok:
                test2["message"] = f"FAILED: {details[:300]}"
                test2["passed"] = False
            elif reward is not None and not (0.05 <= reward <= 0.95):
                test2["message"] = f"Expected 0.05-0.95, got {reward}"
                test2["passed"] = False
            results["tests"].append(test2)
            if verbose and details:
                print(f" [detection_only] scoring: {details[:300]}", file=sys.stderr)

        # Update overall pass
        results["passed"] = all(t["passed"] for t in results["tests"])

    finally:
        # Clean up image even when a verifier step raised/timed out.
        subprocess.run(
            ["docker", "image", "rm", "-f", tag],
            capture_output=True, timeout=30,
        )

    return results
266+
267+
268+
def main():
    """CLI entry point: smoke-test one task, a suite, or all artifact tasks.

    Exits 0 when every task passes, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Smoke-test artifact-only verifiers")
    # Exactly one selection mode must be given.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--task", type=str, help="Path to a single task directory")
    group.add_argument("--suite", type=str, help="Suite name (e.g., ccb_test)")
    group.add_argument("--all", action="store_true", help="All tasks with Dockerfile.artifact_only")
    parser.add_argument("--build-timeout", type=int, default=300, help="Docker build timeout (s)")
    parser.add_argument("--verify-timeout", type=int, default=120, help="Verifier timeout (s)")
    parser.add_argument("--verbose", "-v", action="store_true")
    parser.add_argument("--json", action="store_true", help="Output JSON")
    args = parser.parse_args()

    if args.task:
        task_dir = Path(args.task).resolve()
        if not task_dir.exists():
            # Fall back to resolving the argument relative to the repo root.
            task_dir = ROOT / args.task
            if not task_dir.exists():
                print(f"Task directory not found: {args.task}", file=sys.stderr)
                sys.exit(1)
        tasks = [task_dir]
    elif args.suite:
        tasks = find_artifact_tasks(suite=args.suite)
    else:
        # --all: every task under any ccb_* suite with an artifact Dockerfile.
        tasks = find_artifact_tasks()

    if not tasks:
        print("No tasks with Dockerfile.artifact_only found.", file=sys.stderr)
        sys.exit(1)

    print(f"Smoke-testing {len(tasks)} artifact-only task(s)...\n")
    all_results = []
    n_pass = 0
    n_fail = 0

    for task_dir in tasks:
        task_name = task_dir.name
        print(f" {task_name}...", end=" ", flush=True)

        result = smoke_test_task(
            task_dir,
            build_timeout=args.build_timeout,
            verify_timeout=args.verify_timeout,
            verbose=args.verbose,
        )
        all_results.append(result)

        if result["passed"]:
            n_pass += 1
            tests_summary = ", ".join(
                f"{t['name']}={'OK' if t['passed'] else 'FAIL'}" for t in result["tests"]
            )
            print(f"PASS ({tests_summary})")
        else:
            n_fail += 1
            # Report only the first failing sub-test to keep output compact.
            for t in result["tests"]:
                if not t["passed"]:
                    print(f"FAIL ({t['name']}: {t['message'][:100]})")
                    break

    print(f"\n{'='*50}")
    print(f"Results: {n_pass} passed, {n_fail} failed out of {len(tasks)}")

    if args.json:
        print(json.dumps(all_results, indent=2))

    # Non-zero exit signals CI that at least one task failed.
    sys.exit(1 if n_fail > 0 else 0)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)