Skip to content

Commit 14c2821

Browse files
sjarmakclaude
andcommitted
feat: Wave 3 answer.json integration for ccb_fix + ccb_secure + ccb_build suites
Completes the unified answer.json artifact evaluation across all SDLC suites. 69 tasks integrated (1 TAC task skipped) with auto-detected verifier categories: - 15 test_ratio (SWE-bench Pro): source block + cd VERIFY_REPO replacement - 5 diff_similarity (PyTorch): source block + cd replacement - 13 ir_checklist: source block + solution.md redirect + cd replacement - 17 checklist_code: source block + cd replacement - 8 triage: source block + triage.md redirect - 8 dibench: source block only (best-effort) - 2 f1_scorer: source block only - 1 semantic_similarity: source block only Key Wave 3 addition: VERIFY_REPO-aware cd replacement for code-change verifiers. `cd /workspace` becomes `cd "${VERIFY_REPO:-/workspace}"` making verifiers work in artifact, sg_only, and normal modes without behavior change. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c99cde8 commit 14c2821

File tree

139 files changed

+22202
-50
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

139 files changed

+22202
-50
lines changed
Lines changed: 308 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,308 @@
1+
#!/bin/bash
2+
# answer_json_verifier_lib.sh — Unified answer.json verifier for artifact configs.
3+
#
4+
# Source this at the TOP of test.sh. It detects /tmp/.artifact_only_mode and:
5+
# 1. Validates /workspace/answer.json exists and is valid JSON
6+
# 2. Extracts analysis.summary + analysis.reasoning → $ANALYSIS_TEXT_FILE (for keyword/pattern scoring)
7+
# 3. Extracts analysis.files_examined → $ANALYSIS_FILES_FILE (for IR metrics)
8+
# 4. If changes[] has diffs: copies /repo_full → /tmp/verify_repo, applies all diffs
9+
# 5. Exports VERIFY_REPO, ARTIFACT_ONLY, ANALYSIS_TEXT_FILE, etc.
10+
#
11+
# For non-artifact-only runs, this script is a no-op that sets safe defaults.
12+
#
13+
# Usage in test.sh:
14+
# #!/bin/bash
15+
# set -e
16+
# # Artifact mode: parse answer.json, apply patches, export analysis
17+
# if [ -f /tmp/.artifact_only_mode ] && [ -f /tests/answer_json_verifier_lib.sh ]; then
18+
# source /tests/answer_json_verifier_lib.sh
19+
# fi
20+
# # ... rest of test.sh uses $VERIFY_REPO, $ANALYSIS_TEXT_FILE, etc. ...
21+
22+
# Regular (non artifact-only) run: publish neutral defaults and stop here.
# VERIFY_REPO keeps any value the caller already set; the remaining variables
# are exported empty/false so later code can reference them unconditionally.
if ! test -f /tmp/.artifact_only_mode; then
  : "${VERIFY_REPO:=/workspace}"
  ARTIFACT_ONLY=false
  ANALYSIS_TEXT_FILE=""
  ANALYSIS_FILES_FILE=""
  ANSWER_JSON=""
  export VERIFY_REPO ARTIFACT_ONLY ANALYSIS_TEXT_FILE ANALYSIS_FILES_FILE ANSWER_JSON
  # `return` only works when this file is sourced. When it is executed
  # directly the error is silenced and `|| true` lets execution continue.
  return 0 2>/dev/null || true
fi
31+
32+
# Artifact-only mode: announce it and publish the fixed locations used by the
# rest of this library and by the sourcing test.sh.
printf '%s\n' "[answer_json_verifier] Detected artifact-only mode"
ARTIFACT_ONLY=true
ANSWER_JSON="/workspace/answer.json"
ANALYSIS_TEXT_FILE="/tmp/analysis.txt"
ANALYSIS_FILES_FILE="/tmp/analysis_files.json"
export ARTIFACT_ONLY ANSWER_JSON ANALYSIS_TEXT_FILE ANALYSIS_FILES_FILE
37+
38+
# ── Validate answer.json ──────────────────────────────────────────────────
39+
40+
# No answer.json: report it, keep VERIFY_REPO usable, and hand control back.
# Nothing else is produced, so the sourcing test.sh is expected to score 0.
if ! [ -f "$ANSWER_JSON" ]; then
  printf '%s\n' \
    "[answer_json_verifier] ERROR: /workspace/answer.json not found" \
    "[answer_json_verifier] Agent did not produce required artifact"
  : "${VERIFY_REPO:=/workspace}"
  export VERIFY_REPO
  # `return` only succeeds when sourced; otherwise fall through silently.
  return 0 2>/dev/null || true
fi
47+
48+
# Validate JSON and extract fields using Python
49+
#######################################
# Parse answer.json and materialise everything the verifiers consume.
#
# Writes the analysis text and files_examined list, generates a synthetic
# review.json and fault_localization_result.json, extracts new-file diffs into
# the workspace, and (when changes[] is non-empty and /repo_full exists)
# copies /repo_full to /tmp/verify_repo and applies every diff there.
#
# Globals:   ANALYSIS_TEXT_FILE, ANALYSIS_FILES_FILE (read via environment;
#            defaults /tmp/analysis.txt and /tmp/analysis_files.json)
# Arguments: $1 - path to answer.json
#            $2 - workspace directory for generated files (default /workspace)
# Outputs:   progress on stdout, warnings/errors on stderr; marker files
#            /tmp/.answer_json_no_changes or /tmp/.answer_json_verify_repo
# Returns:   exit status of the embedded Python program
#######################################
_answer_json_process() {
  python3 - "$1" "${2:-/workspace}" <<'PYEOF'
import json
import os
import re
import subprocess
import sys
import tempfile

TAG = "[answer_json_verifier]"
NO_CHANGES_MARKER = "/tmp/.answer_json_no_changes"
VERIFY_REPO_MARKER = "/tmp/.answer_json_verify_repo"

answer_path = sys.argv[1]
workspace = sys.argv[2] if len(sys.argv) > 2 else "/workspace"
# `or` (not a .get default) so an exported-but-empty variable still falls back.
analysis_text_file = os.environ.get("ANALYSIS_TEXT_FILE") or "/tmp/analysis.txt"
analysis_files_file = os.environ.get("ANALYSIS_FILES_FILE") or "/tmp/analysis_files.json"


def mark_no_changes():
    """Tell the shell side that no patched repository copy was produced."""
    with open(NO_CHANGES_MARKER, "w") as f:
        f.write("1")


def load_answer(path):
    """Return answer.json as a dict, or {} when it is missing or invalid."""
    try:
        with open(path, encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        print(f"{TAG} ERROR: answer.json not found", file=sys.stderr)
        return {}
    except (OSError, UnicodeDecodeError) as e:
        print(f"{TAG} ERROR: Failed to parse answer.json: {e}", file=sys.stderr)
        return {}

    try:
        try:
            # Prefer the raw text: valid JSON must never be mangled by the
            # fence heuristic below.
            parsed = json.loads(raw)
        except ValueError:
            # Strip markdown code fences if agent wrapped JSON in ```json blocks
            m = re.search(r'```(?:json)?\s*\n(.*?)```', raw, re.DOTALL)
            if not m:
                raise
            parsed = json.loads(m.group(1).strip())
    except ValueError as e:
        print(f"{TAG} ERROR: Failed to parse answer.json: {e}", file=sys.stderr)
        return {}

    if not isinstance(parsed, dict):
        print(f"{TAG} WARNING: answer.json is not a JSON object", file=sys.stderr)
        return {}
    return parsed


def new_file_lines(diff_text):
    """Return the content lines of a new-file diff, or [] for any other diff.

    A diff counts as "new file" only when a `--- /dev/null` header appears
    before the first hunk. A bare substring test would also match deletion
    diffs (`+++ /dev/null`) and ordinary edits whose lines mention /dev/null.
    """
    is_new = False
    in_hunk = False
    content = []
    for line in diff_text.split("\n"):
        if line.startswith("@@"):
            in_hunk = True
            continue
        if not in_hunk:
            # Header region; tolerate a trailing timestamp after a tab.
            if line.split("\t", 1)[0].rstrip() == "--- /dev/null":
                is_new = True
            continue
        if line.startswith("+"):
            content.append(line[1:])  # strip leading +
        elif line.startswith("-") or line.startswith("\\"):
            continue  # removed line / "\ No newline at end of file"
        else:
            content.append(line)  # context line
    return content if is_new else []


# ── Parse answer.json ─────────────────────────────────────────────────────
answer = load_answer(answer_path)

# ── Extract analysis fields ───────────────────────────────────────────────
analysis = answer.get("analysis", {})
if not isinstance(analysis, dict):
    analysis = {}


def text_field(key):
    """Return analysis[key] when it is a string, else ''."""
    value = analysis.get(key, "")
    return value if isinstance(value, str) else ""


# Build analysis text from summary + reasoning (what verifiers will grep)
summary = text_field("summary")
reasoning = text_field("reasoning")
analysis_text = "\n\n".join(part for part in (summary, reasoning) if part)

with open(analysis_text_file, "w", encoding="utf-8") as f:
    f.write(analysis_text)
print(f"{TAG} Wrote analysis text ({len(analysis_text)} chars) to {analysis_text_file}")

# Extract files_examined for IR metrics
files_examined = analysis.get("files_examined", [])
if not isinstance(files_examined, list):
    files_examined = []
with open(analysis_files_file, "w", encoding="utf-8") as f:
    json.dump(files_examined, f, indent=2)
print(f"{TAG} Wrote {len(files_examined)} examined files to {analysis_files_file}")

# ── Normalise changes[] ───────────────────────────────────────────────────
changes = answer.get("changes", [])
if not isinstance(changes, list):
    changes = []
# Every later step calls .get() on the entries, so drop anything that is not
# a JSON object instead of crashing on it.
well_formed = [c for c in changes if isinstance(c, dict)]
if len(well_formed) != len(changes):
    print(
        f"{TAG} WARNING: Ignoring {len(changes) - len(well_formed)} malformed changes[] entries",
        file=sys.stderr,
    )
changes = well_formed

if not changes:
    print(f"{TAG} No changes[] in answer.json (analysis-only task)")
    # Signal no patches needed
    mark_no_changes()

# ── Generate synthetic review.json for code-review verifiers ──────────────
# Code-review verifiers expect review.json with [{file, description, fix_patch}];
# generate it from changes[] so existing F1 scoring works unchanged.
if changes:
    review_entries = [
        {
            "file": change.get("file", ""),
            "description": change.get("description", ""),
            "fix_patch": change.get("diff", ""),
        }
        for change in changes
    ]
    with open(os.path.join(workspace, "review.json"), "w", encoding="utf-8") as f:
        json.dump(review_entries, f, indent=2)
    print(f"{TAG} Generated synthetic review.json ({len(review_entries)} entries)")

# ── Extract new-file diffs to the workspace ───────────────────────────────
# For find-and-prove tasks the agent writes regression tests as new-file
# diffs; their content is written directly into the workspace directory.
new_files_written = 0
for change in changes:
    diff_text = change.get("diff", "")
    file_path = change.get("file", "")
    if not isinstance(diff_text, str) or not isinstance(file_path, str):
        continue
    if not diff_text or not file_path:
        continue

    content_lines = new_file_lines(diff_text)
    base_name = os.path.basename(file_path)
    if not content_lines or not base_name:
        continue

    # Only the basename is used, so the file always lands directly in the
    # workspace regardless of the directory named in the diff.
    target = os.path.join(workspace, base_name)
    os.makedirs(workspace, exist_ok=True)
    with open(target, "w", encoding="utf-8") as f:
        f.write("\n".join(content_lines))
        if content_lines[-1] != "":
            f.write("\n")
    new_files_written += 1
    print(f"{TAG} Extracted new file: {target}")

if new_files_written > 0:
    print(f"{TAG} Extracted {new_files_written} new files to /workspace/")

# ── Generate fault_localization_result.json ───────────────────────────────
# Fault-loc verifiers expect {buggy_files, buggy_functions, reasoning,
# confidence}; populate whatever the analysis section provides.
if analysis:
    fl_result = {}
    # buggy_files: taken from files_examined entries that carry a "path"
    fl_files = [
        fe["path"] for fe in files_examined
        if isinstance(fe, dict) and fe.get("path")
    ]
    if fl_files:
        fl_result["buggy_files"] = fl_files
    # buggy_functions: "buggy_functions" key, falling back to "functions"
    fl_funcs = analysis.get("buggy_functions", analysis.get("functions", []))
    if isinstance(fl_funcs, list) and fl_funcs:
        fl_result["buggy_functions"] = fl_funcs
    if reasoning:
        fl_result["reasoning"] = reasoning
    confidence = analysis.get("confidence", None)
    if isinstance(confidence, (int, float)):
        fl_result["confidence"] = confidence
    # Only write substantive content, and never clobber an existing file.
    fl_path = os.path.join(workspace, "fault_localization_result.json")
    if fl_result and not os.path.exists(fl_path):
        with open(fl_path, "w", encoding="utf-8") as f:
            json.dump(fl_result, f, indent=2)
        print(f"{TAG} Generated fault_localization_result.json")

# ── Apply diffs to verify_repo (for code-change verification) ─────────────
if not changes:
    sys.exit(0)

# We have diffs to apply — need /repo_full
verify_repo = "/tmp/verify_repo"
repo_full = "/repo_full"

if not os.path.isdir(repo_full):
    print(f"{TAG} WARNING: {repo_full} not found. Cannot apply diffs.")
    mark_no_changes()
    sys.exit(0)

# Start from a pristine copy of /repo_full
print(f"{TAG} Copying {repo_full} -> {verify_repo}...")
subprocess.run(["rm", "-rf", verify_repo], check=True)
subprocess.run(["cp", "-a", repo_full, verify_repo], check=True)
subprocess.run(
    ["git", "config", "--global", "--add", "safe.directory", verify_repo],
    capture_output=True,
)

# Tried in order, strictest first; the patch file path is appended to each.
APPLY_STRATEGIES = (
    ("git apply", ["git", "apply", "--allow-empty"]),
    ("patch -p1", ["patch", "-p1", "--fuzz=3", "-i"]),
    ("git apply --3way", ["git", "apply", "--allow-empty", "--3way"]),
)

applied = 0
failed = 0

for entry in changes:
    diff_text = entry.get("diff", "")
    if not isinstance(diff_text, str) or not diff_text.strip():
        continue
    # A diff whose last line lacks its newline is rejected by git apply as
    # corrupt; completing the line is harmless.
    if not diff_text.endswith("\n"):
        diff_text += "\n"

    file_name = entry.get("file", "unknown")

    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".patch", delete=False, dir="/tmp", encoding="utf-8"
    ) as pf:
        pf.write(diff_text)
        pf_path = pf.name

    try:
        for label, command in APPLY_STRATEGIES:
            try:
                result = subprocess.run(
                    [*command, pf_path],
                    cwd=verify_repo, capture_output=True, text=True,
                )
            except OSError:
                continue  # tool not installed: treat as a failed strategy
            if result.returncode == 0:
                applied += 1
                print(f"{TAG} Applied diff for {file_name} ({label})")
                break
        else:
            failed += 1
            print(f"{TAG} WARNING: Diff for {file_name} failed to apply", file=sys.stderr)
    finally:
        os.unlink(pf_path)

print(f"{TAG} Diffs applied: {applied}, failed: {failed}")

# Write verify_repo path for shell to pick up
with open(VERIFY_REPO_MARKER, "w") as f:
    f.write(verify_repo)
PYEOF
}

# Guarded so that a crash in the Python step does not abort a sourcing script
# running under `set -e`; the VERIFY_REPO fallback logic below still runs.
_answer_json_process "$ANSWER_JSON" \
  || echo "[answer_json_verifier] WARNING: answer.json processing failed (exit $?)" >&2
286+
287+
# Pick up VERIFY_REPO from Python output
288+
#######################################
# Resolve VERIFY_REPO from the marker files left by the Python step, then
# remove the markers.
#
#   /tmp/.answer_json_verify_repo  -> use the directory it names and cd there
#   /tmp/.answer_json_no_changes   -> analysis-only: /repo_full if present,
#                                     else the existing VERIFY_REPO or /workspace
#   neither                        -> existing VERIFY_REPO or /workspace
#
# Globals:   VERIFY_REPO (read, exported)
# Arguments: none
# Outputs:   status lines on stdout; a warning on stderr when the recorded
#            directory cannot be entered
# Returns:   0
#######################################
_answer_json_pick_verify_repo() {
  local repo_marker=/tmp/.answer_json_verify_repo
  local no_changes_marker=/tmp/.answer_json_no_changes
  local candidate=""

  if [ -f "$repo_marker" ]; then
    # Read separately from `export` so a failed read is not masked, and only
    # adopt the directory once `cd` into it has actually succeeded.
    candidate="$(cat -- "$repo_marker")" || candidate=""
    if [ -n "$candidate" ] && cd -- "$candidate"; then
      export VERIFY_REPO="$candidate"
      echo "[answer_json_verifier] VERIFY_REPO set to $VERIFY_REPO"
    else
      export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
      echo "[answer_json_verifier] WARNING: cannot enter '$candidate'; using fallback VERIFY_REPO=$VERIFY_REPO" >&2
    fi
  elif [ -f "$no_changes_marker" ]; then
    # Analysis-only: no repo copy needed, use /workspace or /repo_full as fallback
    if [ -d /repo_full ]; then
      export VERIFY_REPO="/repo_full"
    else
      export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
    fi
    echo "[answer_json_verifier] Analysis-only mode, VERIFY_REPO=$VERIFY_REPO"
  else
    export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
    echo "[answer_json_verifier] WARNING: Using fallback VERIFY_REPO=$VERIFY_REPO"
  fi

  # Clean up temp markers
  rm -f -- "$repo_marker" "$no_changes_marker"
  return 0
}

_answer_json_pick_verify_repo

echo "[answer_json_verifier] Library loaded (ARTIFACT_ONLY=$ARTIFACT_ONLY, VERIFY_REPO=$VERIFY_REPO)"

benchmarks/ccb_build/camel-fix-protocol-feat-001/tests/test.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,12 @@ set -e
66

77
# sg_only_env: restore full repo before verification (no-op for regular runs)
88
[ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ] && source /tests/sgonly_verifier_wrapper.sh
9+
# Artifact mode: parse answer.json, extract analysis text, apply diffs
10+
if [ -f /tmp/.artifact_only_mode ] && [ -f /tests/answer_json_verifier_lib.sh ]; then
11+
source /tests/answer_json_verifier_lib.sh
12+
fi
913

10-
cd /workspace
14+
cd "${VERIFY_REPO:-/workspace}"
1115
mkdir -p /logs/verifier
1216

1317
git config --global --add safe.directory /workspace 2>/dev/null || true
@@ -38,6 +42,12 @@ fi
3842
echo "Change detection: unstaged=$UNSTAGED_COUNT staged=$STAGED_COUNT untracked=$UNTRACKED_COUNT commits=$COMMIT_COUNT"
3943

4044
SOLUTION_FILE="/logs/agent/solution.md"
45+
# In artifact mode, populate expected output from answer.json analysis
46+
if [ "${ARTIFACT_ONLY:-false}" = "true" ] && [ -f "${ANALYSIS_TEXT_FILE:-}" ]; then
47+
mkdir -p "/logs/agent"
48+
cp "$ANALYSIS_TEXT_FILE" "/logs/agent/solution.md"
49+
echo "[answer_json] Copied analysis text to /logs/agent/solution.md"
50+
fi
4151
HAS_CHANGES=0
4252
if [ "$UNSTAGED_COUNT" -gt 0 ] || [ "$STAGED_COUNT" -gt 0 ] || [ "$UNTRACKED_COUNT" -gt 0 ] || [ "$COMMIT_COUNT" -gt 0 ]; then
4353
HAS_CHANGES=1

0 commit comments

Comments
 (0)