sourcegraph
diff --git a/‎benchmarks/ccb_build/camel-fix-protocol-feat-001/tests/answer_json_verifier_lib.sh‎
Lines changed: 308 additions & 0 deletions b/‎benchmarks/ccb_build/camel-fix-protocol-feat-001/tests/answer_json_verifier_lib.sh‎
Lines changed: 308 additions & 0 deletions
diff --git a/‎benchmarks/ccb_build/camel-fix-protocol-feat-001/tests/test.sh‎
Lines changed: 11 additions & 1 deletion b/‎benchmarks/ccb_build/camel-fix-protocol-feat-001/tests/test.sh‎
Lines changed: 11 additions & 1 deletion
@@ -0,0 +1,308 @@
+#!/bin/bash
+# answer_json_verifier_lib.sh — Unified answer.json verifier for artifact configs.
+#
+# Source this at the TOP of test.sh. It detects /tmp/.artifact_only_mode and:
+#   1. Validates /workspace/answer.json exists and is valid JSON
+#   2. Extracts analysis.summary + analysis.reasoning → $ANALYSIS_TEXT_FILE (for keyword/pattern scoring)
+#   3. Extracts analysis.files_examined → $ANALYSIS_FILES_FILE (for IR metrics)
+#   4. If changes[] has diffs: copies /repo_full → /tmp/verify_repo, applies all diffs
+#   5. Exports VERIFY_REPO, ARTIFACT_ONLY, ANALYSIS_TEXT_FILE, etc.
+#
+# For non-artifact-only runs, this script is a no-op that sets safe defaults.
+#
+# Usage in test.sh:
+#   #!/bin/bash
+#   set -e
+#   # Artifact mode: parse answer.json, apply patches, export analysis
+#   if [ -f /tmp/.artifact_only_mode ] && [ -f /tests/answer_json_verifier_lib.sh ]; then
+#       source /tests/answer_json_verifier_lib.sh
+#   fi
+#   # ... rest of test.sh uses $VERIFY_REPO, $ANALYSIS_TEXT_FILE, etc. ...
+
+# Outside artifact-only mode this library only publishes defaults, so a
+# test.sh that sources it unconditionally keeps verifying /workspace.
+if [ ! -f /tmp/.artifact_only_mode ]; then
+    # Not in artifact-only mode — export defaults for backward compat
+    export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
+    export ARTIFACT_ONLY=false
+    export ANALYSIS_TEXT_FILE=""
+    export ANALYSIS_FILES_FILE=""
+    export ANSWER_JSON=""
+    # `return` only succeeds when this file is sourced. If it was executed
+    # instead, stop here rather than falling through into the artifact-mode
+    # code below.
+    return 0 2>/dev/null || exit 0
+fi
+
+echo "[answer_json_verifier] Detected artifact-only mode"
+export ARTIFACT_ONLY=true
+export ANSWER_JSON="/workspace/answer.json"
+export ANALYSIS_TEXT_FILE="/tmp/analysis.txt"
+export ANALYSIS_FILES_FILE="/tmp/analysis_files.json"
+
+# Remove outputs and marker files left behind by an earlier run. Without this,
+# a missing answer.json could still be scored from a stale analysis file, and a
+# stale marker could steer the VERIFY_REPO selection further down.
+rm -f -- "$ANALYSIS_TEXT_FILE" "$ANALYSIS_FILES_FILE" \
+    /tmp/.answer_json_verify_repo /tmp/.answer_json_no_changes
+
+# ── Validate answer.json ──────────────────────────────────────────────────
+
+if [ ! -f "$ANSWER_JSON" ]; then
+    echo "[answer_json_verifier] ERROR: /workspace/answer.json not found"
+    echo "[answer_json_verifier] Agent did not produce required artifact"
+    export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
+    # Signal to test.sh that there's no output — it should score 0.
+    # `return` works when sourced; when executed directly, exit instead of
+    # continuing into the parser with no file to read.
+    return 0 2>/dev/null || exit 0
+fi
+
+# Validate JSON and extract fields using Python
+python3 - "$ANSWER_JSON" <<'PYEOF'
+import json, sys, os, subprocess, tempfile, re
+
+answer_path = sys.argv[1]
+analysis_text_file = os.environ.get("ANALYSIS_TEXT_FILE", "/tmp/analysis.txt")
+analysis_files_file = os.environ.get("ANALYSIS_FILES_FILE", "/tmp/analysis_files.json")
+
+# ── Parse answer.json ─────────────────────────────────────────────────────
+try:
+    with open(answer_path) as f:
+        raw = f.read()
+
+    # Strip markdown code fences if agent wrapped JSON in ```json blocks
+    m = re.search(r'```(?:json)?\s*\n(.*?)```', raw, re.DOTALL)
+    if m:
+        raw = m.group(1).strip()
+
+    answer = json.loads(raw)
+    if not isinstance(answer, dict):
+        print("[answer_json_verifier] WARNING: answer.json is not a JSON object", file=sys.stderr)
+        answer = {}
+except (json.JSONDecodeError, ValueError) as e:
+    print(f"[answer_json_verifier] ERROR: Failed to parse answer.json: {e}", file=sys.stderr)
+    answer = {}
+except FileNotFoundError:
+    print("[answer_json_verifier] ERROR: answer.json not found", file=sys.stderr)
+    answer = {}
+
+# ── Extract analysis fields ───────────────────────────────────────────────
+analysis = answer.get("analysis", {})
+if not isinstance(analysis, dict):
+    analysis = {}
+
+# Build analysis text from summary + reasoning (what verifiers will grep)
+parts = []
+summary = analysis.get("summary", "")
+if summary:
+    parts.append(summary)
+reasoning = analysis.get("reasoning", "")
+if reasoning:
+    parts.append(reasoning)
+analysis_text = "\n\n".join(parts)
+
+with open(analysis_text_file, "w") as f:
+    f.write(analysis_text)
+print(f"[answer_json_verifier] Wrote analysis text ({len(analysis_text)} chars) to {analysis_text_file}")
+
+# Extract files_examined for IR metrics
+files_examined = analysis.get("files_examined", [])
+if not isinstance(files_examined, list):
+    files_examined = []
+with open(analysis_files_file, "w") as f:
+    json.dump(files_examined, f, indent=2)
+print(f"[answer_json_verifier] Wrote {len(files_examined)} examined files to {analysis_files_file}")
+
+# ── Extract and apply diffs from changes[] ────────────────────────────────
+changes = answer.get("changes", [])
+if not isinstance(changes, list):
+    changes = []
+
+if not changes:
+    print("[answer_json_verifier] No changes[] in answer.json (analysis-only task)")
+    # Signal no patches needed
+    with open("/tmp/.answer_json_no_changes", "w") as f:
+        f.write("1")
+
+# ── Generate synthetic review.json for code-review verifiers ──────────────
+# Code-review verifiers expect /workspace/review.json with [{file, description, fix_patch}]
+# Generate this from answer.json changes[] so existing F1 scoring works unchanged.
+if changes:
+    review_entries = []
+    for change in changes:
+        entry = {
+            "file": change.get("file", ""),
+            "description": change.get("description", ""),
+            "fix_patch": change.get("diff", ""),
+        }
+        review_entries.append(entry)
+    review_json_path = "/workspace/review.json"
+    with open(review_json_path, "w") as f:
+        json.dump(review_entries, f, indent=2)
+    print(f"[answer_json_verifier] Generated synthetic review.json ({len(review_entries)} entries)")
+
+# ── Extract new-file diffs to /workspace/ ─────────────────────────────────
+# For find-and-prove tasks: agent writes regression tests as new-file diffs.
+# Extract file content from diffs like "--- /dev/null\n+++ b/regression_test.py"
+# and write directly to /workspace/.
+new_files_written = 0
+for change in changes:
+    diff_text = change.get("diff", "")
+    file_path = change.get("file", "")
+    if not diff_text or not file_path:
+        continue
+
+    # Detect new-file diff: starts from /dev/null
+    if "/dev/null" in diff_text:
+        # Extract added lines (lines starting with +, excluding +++ header)
+        lines = diff_text.split("\n")
+        content_lines = []
+        in_hunk = False
+        for line in lines:
+            if line.startswith("@@"):
+                in_hunk = True
+                continue
+            if in_hunk:
+                if line.startswith("+"):
+                    content_lines.append(line[1:])  # Strip leading +
+                elif line.startswith("-"):
+                    pass  # skip removed lines (shouldn't exist in new-file)
+                elif line.startswith("\\"):
+                    pass  # "\ No newline at end of file"
+                else:
+                    content_lines.append(line)  # context line
+
+        if content_lines:
+            # Determine target path — use file field, write to /workspace/
+            target = os.path.join("/workspace", os.path.basename(file_path))
+            os.makedirs(os.path.dirname(target), exist_ok=True)
+            with open(target, "w") as f:
+                f.write("\n".join(content_lines))
+                if content_lines and not content_lines[-1] == "":
+                    f.write("\n")
+            new_files_written += 1
+            print(f"[answer_json_verifier] Extracted new file: {target}")
+
+if new_files_written > 0:
+    print(f"[answer_json_verifier] Extracted {new_files_written} new files to /workspace/")
+
+# ── Generate fault_localization_result.json ────────────────────────────────
+# Fault-loc verifiers expect /workspace/fault_localization_result.json with
+# {buggy_files, buggy_functions, reasoning, confidence}. Populate from analysis.
+if analysis:
+    fl_result = {}
+    # buggy_files: extract from files_examined
+    fl_files = [fe.get("path", "") for fe in files_examined if fe.get("path")]
+    if fl_files:
+        fl_result["buggy_files"] = fl_files
+    # buggy_functions: look for a "functions" or "buggy_functions" key in analysis
+    fl_funcs = analysis.get("buggy_functions", analysis.get("functions", []))
+    if isinstance(fl_funcs, list) and fl_funcs:
+        fl_result["buggy_functions"] = fl_funcs
+    # reasoning: use the full analysis text
+    if reasoning:
+        fl_result["reasoning"] = reasoning
+    # confidence: look for a confidence key
+    confidence = analysis.get("confidence", None)
+    if isinstance(confidence, (int, float)):
+        fl_result["confidence"] = confidence
+    # Only write if we have substantive content
+    fl_path = "/workspace/fault_localization_result.json"
+    if fl_result and not os.path.exists(fl_path):
+        with open(fl_path, "w") as f:
+            json.dump(fl_result, f, indent=2)
+        print(f"[answer_json_verifier] Generated fault_localization_result.json")
+
+# ── Apply diffs to verify_repo (for code-change verification) ─────────────
+if not changes:
+    sys.exit(0)
+
+# We have diffs to apply — need /repo_full
+verify_repo = "/tmp/verify_repo"
+repo_full = "/repo_full"
+
+if not os.path.isdir(repo_full):
+    print(f"[answer_json_verifier] WARNING: {repo_full} not found. Cannot apply diffs.")
+    with open("/tmp/.answer_json_no_changes", "w") as f:
+        f.write("1")
+    sys.exit(0)
+
+# Copy /repo_full to /tmp/verify_repo
+print(f"[answer_json_verifier] Copying {repo_full} -> {verify_repo}...")
+subprocess.run(["rm", "-rf", verify_repo], check=True)
+subprocess.run(["cp", "-a", repo_full, verify_repo], check=True)
+subprocess.run(
+    ["git", "config", "--global", "--add", "safe.directory", verify_repo],
+    capture_output=True
+)
+
+# Apply each diff
+applied = 0
+failed = 0
+
+for entry in changes:
+    diff_text = entry.get("diff", "")
+    if not diff_text or not diff_text.strip():
+        continue
+
+    file_name = entry.get("file", "unknown")
+
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.patch', delete=False, dir='/tmp') as pf:
+        pf.write(diff_text)
+        pf.flush()
+        pf_path = pf.name
+
+    # Try git apply (strictest)
+    result = subprocess.run(
+        ["git", "apply", "--allow-empty", pf_path],
+        cwd=verify_repo, capture_output=True, text=True
+    )
+    if result.returncode == 0:
+        applied += 1
+        os.unlink(pf_path)
+        print(f"[answer_json_verifier] Applied diff for {file_name} (git apply)")
+        continue
+
+    # Fallback: patch -p1 --fuzz=3
+    result = subprocess.run(
+        ["patch", "-p1", "--fuzz=3", "-i", pf_path],
+        cwd=verify_repo, capture_output=True, text=True
+    )
+    if result.returncode == 0:
+        applied += 1
+        os.unlink(pf_path)
+        print(f"[answer_json_verifier] Applied diff for {file_name} (patch -p1)")
+        continue
+
+    # Fallback: git apply --3way
+    result = subprocess.run(
+        ["git", "apply", "--allow-empty", "--3way", pf_path],
+        cwd=verify_repo, capture_output=True, text=True
+    )
+    if result.returncode == 0:
+        applied += 1
+        os.unlink(pf_path)
+        print(f"[answer_json_verifier] Applied diff for {file_name} (git apply --3way)")
+        continue
+
+    failed += 1
+    print(f"[answer_json_verifier] WARNING: Diff for {file_name} failed to apply", file=sys.stderr)
+    os.unlink(pf_path)
+
+print(f"[answer_json_verifier] Diffs applied: {applied}, failed: {failed}")
+
+# Write verify_repo path for shell to pick up
+with open("/tmp/.answer_json_verify_repo", "w") as f:
+    f.write(verify_repo)
+PYEOF
+
+# Pick up VERIFY_REPO from Python output.
+# The Python step communicates through two marker files:
+#   /tmp/.answer_json_verify_repo  — holds the path of the patched repo copy
+#   /tmp/.answer_json_no_changes   — written when changes[] is empty or
+#                                    /repo_full is missing
+if [ -f /tmp/.answer_json_verify_repo ]; then
+    # Diffs were processed: verify against the patched copy and move into it.
+    export VERIFY_REPO="$(cat /tmp/.answer_json_verify_repo)"
+    cd "$VERIFY_REPO"
+    echo "[answer_json_verifier] VERIFY_REPO set to $VERIFY_REPO"
+elif [ -f /tmp/.answer_json_no_changes ]; then
+    # Analysis-only: no repo copy needed. Prefer /repo_full when it exists,
+    # otherwise keep a caller-provided VERIFY_REPO or default to /workspace.
+    if [ -d /repo_full ]; then
+        export VERIFY_REPO="/repo_full"
+    else
+        export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
+    fi
+    echo "[answer_json_verifier] Analysis-only mode, VERIFY_REPO=$VERIFY_REPO"
+else
+    # Neither marker exists: keep a caller-provided VERIFY_REPO or default to
+    # /workspace, and say so in the log.
+    export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
+    echo "[answer_json_verifier] WARNING: Using fallback VERIFY_REPO=$VERIFY_REPO"
+fi
+
+# Clean up temp markers so they are not left in /tmp after this point
+rm -f /tmp/.answer_json_verify_repo /tmp/.answer_json_no_changes
+
+# Final status line with the values exported for the rest of test.sh
+echo "[answer_json_verifier] Library loaded (ARTIFACT_ONLY=$ARTIFACT_ONLY, VERIFY_REPO=$VERIFY_REPO)"
@@ -6,8 +6,12 @@ set -e
 
 # sg_only_env: restore full repo before verification (no-op for regular runs)
 [ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ] && source /tests/sgonly_verifier_wrapper.sh
+# Artifact mode: parse answer.json, extract analysis text, apply diffs
+if [ -f /tmp/.artifact_only_mode ] && [ -f /tests/answer_json_verifier_lib.sh ]; then
+    source /tests/answer_json_verifier_lib.sh
+fi
 
-cd /workspace
+cd "${VERIFY_REPO:-/workspace}"
 mkdir -p /logs/verifier
 
 git config --global --add safe.directory /workspace 2>/dev/null || true
@@ -38,6 +42,12 @@ fi
 echo "Change detection: unstaged=$UNSTAGED_COUNT staged=$STAGED_COUNT untracked=$UNTRACKED_COUNT commits=$COMMIT_COUNT"
 
 SOLUTION_FILE="/logs/agent/solution.md"
+# In artifact mode, populate expected output from answer.json analysis
+if [ "${ARTIFACT_ONLY:-false}" = "true" ] && [ -f "${ANALYSIS_TEXT_FILE:-}" ]; then
+    mkdir -p "/logs/agent"
+    cp "$ANALYSIS_TEXT_FILE" "/logs/agent/solution.md"
+    echo "[answer_json] Copied analysis text to /logs/agent/solution.md"
+fi
 HAS_CHANGES=0
 if [ "$UNSTAGED_COUNT" -gt 0 ] || [ "$STAGED_COUNT" -gt 0 ] || [ "$UNTRACKED_COUNT" -gt 0 ] || [ "$COMMIT_COUNT" -gt 0 ]; then
     HAS_CHANGES=1