#!/usr/bin/env bash
# Reward: find_and_prove (0.0-1.0) — 2-phase verification with majority-of-3 voting
#
# Phase 1 (0.5): Agent's regression test FAILS on buggy code (>=2 of 3 runs fail)
# Phase 2 (0.5): Agent's regression test PASSES after applying reference patch (>=2 of 3 runs pass)
#
# Environment variables (set by each task's test.sh before sourcing):
#   AGENT_TEST_PATH — path to agent-written test (default: /workspace/regression_test.py)
#   REFERENCE_PATCH — path to known-good patch (default: /tests/reference_fix.patch)
#   TEST_COMMAND    — command to run the test (default: python3 -m pytest)
#   PATCH_APPLY_DIR — directory to apply patch in (default: /workspace)

# Strict mode: abort on unhandled errors, unset variables, and failures in
# any stage of a pipeline.
set -euo pipefail

# --- Defaults ---
# ${VAR:-default} keeps any non-empty value already present in the
# environment and falls back to the default when VAR is unset or empty.
AGENT_TEST_PATH="${AGENT_TEST_PATH:-/workspace/regression_test.py}"
REFERENCE_PATCH="${REFERENCE_PATCH:-/tests/reference_fix.patch}"
TEST_COMMAND="${TEST_COMMAND:-python3 -m pytest}"
PATCH_APPLY_DIR="${PATCH_APPLY_DIR:-/workspace}"

# --- Logging setup ---
# All verifier output goes under LOG_DIR; the directory is created if it
# does not exist yet.
LOG_DIR="/logs/verifier"
mkdir -p "$LOG_DIR"

PHASE1_LOG="$LOG_DIR/phase1.log"    # raw test output for the Phase 1 runs
PHASE2_LOG="$LOG_DIR/phase2.log"    # patch + raw test output for Phase 2
SUMMARY_LOG="$LOG_DIR/summary.log"  # per-phase verdicts and the final score
REWARD_FILE="$LOG_DIR/reward.txt"   # holds only the final score

# --- Helper: write final score and exit ---
#######################################
# Record the final score and terminate the shell successfully.
# Globals:
#   REWARD_FILE (read) - file overwritten with the score
#   SUMMARY_LOG (read) - file the "FINAL SCORE" line is appended to
# Arguments:
#   $1 - score string to record (the script passes "0.0", "0.5" or "1.0")
# Outputs:
#   Writes the score to stdout.
# Returns:
#   Never returns; always exits with status 0.
#######################################
write_score() {
  local score="$1"
  printf '%s\n' "$score" > "$REWARD_FILE"
  printf '%s\n' "=== FINAL SCORE: $score ===" >> "$SUMMARY_LOG"
  printf '%s\n' "$score"
  exit 0
}

# --- Summary header ---
# Truncate and start the summary log BEFORE any check can append to it, so
# an early exit below never leaves output from a previous run in the file.
printf '%s\n' \
  "=== Find and Prove Verifier ===" \
  "AGENT_TEST_PATH=$AGENT_TEST_PATH" \
  "REFERENCE_PATCH=$REFERENCE_PATCH" \
  "TEST_COMMAND=$TEST_COMMAND" \
  "PATCH_APPLY_DIR=$PATCH_APPLY_DIR" \
  "" > "$SUMMARY_LOG"

# --- Edge case: agent test does not exist or is empty ---
# write_score exits the shell, so each branch below is terminal.
if [[ ! -f "$AGENT_TEST_PATH" ]]; then
  printf '%s\n' "ERROR: Agent test not found at $AGENT_TEST_PATH" >> "$SUMMARY_LOG"
  write_score "0.0"
fi

if [[ ! -s "$AGENT_TEST_PATH" ]]; then
  printf '%s\n' "ERROR: Agent test is empty at $AGENT_TEST_PATH" >> "$SUMMARY_LOG"
  write_score "0.0"
fi

# --- Phase 1: Test should FAIL on buggy code (>=2 of 3 fail) ---
echo "=== PHASE 1: Verify test fails on buggy code ===" >> "$SUMMARY_LOG"
echo "=== PHASE 1: Verify test fails on buggy code ===" > "$PHASE1_LOG"

# Split TEST_COMMAND into words once. Unlike an unquoted $TEST_COMMAND
# expansion, this does not glob-expand characters such as '*' in the command.
read -r -a phase1_cmd <<< "$TEST_COMMAND"

phase1_failures=0
for run in 1 2 3; do
  echo "--- Phase 1, run $run ---" >> "$PHASE1_LOG"

  # Capture the status with '||' so a failing test does not trip 'set -e'.
  # Each run is limited to 60 seconds.
  exit_code=0
  timeout 60 "${phase1_cmd[@]}" "$AGENT_TEST_PATH" >> "$PHASE1_LOG" 2>&1 || exit_code=$?

  if [[ $exit_code -eq 0 ]]; then
    echo "Run $run: PASSED (exit 0) — unexpected" >> "$PHASE1_LOG"
  elif [[ $exit_code -ge 125 && $exit_code -le 127 ]]; then
    # timeout(1) reports 125 (timeout itself failed), 126 (command found but
    # not invocable) and 127 (command not found). These mean the test never
    # ran, so they must not count as the test failing on buggy code.
    echo "Run $run: ERROR (exit $exit_code) — test command could not be run; not counted as a failure" >> "$PHASE1_LOG"
  else
    phase1_failures=$((phase1_failures + 1))
    echo "Run $run: FAILED (exit $exit_code) — expected" >> "$PHASE1_LOG"
  fi
done

# Majority vote: the phase passes when at least 2 of the 3 runs failed.
phase1_pass=0
if [[ $phase1_failures -ge 2 ]]; then
  phase1_pass=1
  echo "Phase 1 PASSED: $phase1_failures/3 runs failed (>=2 required)" >> "$SUMMARY_LOG"
else
  echo "Phase 1 FAILED: only $phase1_failures/3 runs failed (<2 required)" >> "$SUMMARY_LOG"
fi

# --- Edge case: no reference patch ---
# Without a patch Phase 2 cannot run, so the score is decided by Phase 1
# alone. write_score exits the shell, so nothing below runs in this case.
if [[ ! -f "$REFERENCE_PATCH" ]]; then
  echo "WARNING: Reference patch not found at $REFERENCE_PATCH — skipping Phase 2" >> "$SUMMARY_LOG"
  if [[ $phase1_pass -eq 1 ]]; then
    write_score "0.5"
  else
    write_score "0.0"
  fi
fi

# --- Phase 2: Apply patch, test should PASS (>=2 of 3 pass) ---
echo "=== PHASE 2: Verify test passes after fix ===" >> "$SUMMARY_LOG"
echo "=== PHASE 2: Verify test passes after fix ===" > "$PHASE2_LOG"

# Apply the reference patch
echo "Applying patch: git apply $REFERENCE_PATCH in $PATCH_APPLY_DIR" >> "$PHASE2_LOG"

# Resolve a relative patch path against the current directory before the
# subshell changes into PATCH_APPLY_DIR; otherwise git would look for the
# patch relative to the wrong directory.
patch_file="$REFERENCE_PATCH"
if [[ "$patch_file" != /* ]]; then
  patch_file="$PWD/$patch_file"
fi

# The subshell keeps the 'cd' local; '||' captures the status without
# tripping 'set -e'.
apply_exit=0
(cd "$PATCH_APPLY_DIR" && git apply "$patch_file") >> "$PHASE2_LOG" 2>&1 || apply_exit=$?

# If the patch cannot be applied, Phase 2 is skipped and only Phase 1 is
# scored. write_score exits the shell.
if [[ $apply_exit -ne 0 ]]; then
  echo "ERROR: Patch application failed (exit $apply_exit)" >> "$PHASE2_LOG"
  echo "ERROR: Patch application failed — scoring Phase 1 only" >> "$SUMMARY_LOG"
  if [[ $phase1_pass -eq 1 ]]; then
    write_score "0.5"
  else
    write_score "0.0"
  fi
fi

# Run the agent's test three times against the patched code.
# Split TEST_COMMAND into words once. Unlike an unquoted $TEST_COMMAND
# expansion, this does not glob-expand characters such as '*' in the command.
read -r -a phase2_cmd <<< "$TEST_COMMAND"

phase2_passes=0
for run in 1 2 3; do
  echo "--- Phase 2, run $run ---" >> "$PHASE2_LOG"

  # Capture the status with '||' so a failing test does not trip 'set -e'.
  # Each run is limited to 60 seconds.
  exit_code=0
  timeout 60 "${phase2_cmd[@]}" "$AGENT_TEST_PATH" >> "$PHASE2_LOG" 2>&1 || exit_code=$?

  if [[ $exit_code -eq 0 ]]; then
    phase2_passes=$((phase2_passes + 1))
    echo "Run $run: PASSED (exit 0) — expected" >> "$PHASE2_LOG"
  else
    echo "Run $run: FAILED (exit $exit_code) — unexpected" >> "$PHASE2_LOG"
  fi
done

# Majority vote: the phase passes when at least 2 of the 3 runs passed.
phase2_pass=0
if [[ $phase2_passes -ge 2 ]]; then
  phase2_pass=1
  echo "Phase 2 PASSED: $phase2_passes/3 runs passed (>=2 required)" >> "$SUMMARY_LOG"
else
  echo "Phase 2 FAILED: only $phase2_passes/3 runs passed (<2 required)" >> "$SUMMARY_LOG"
fi

# --- Compute final score ---
# Each phase is worth 0.5 and is scored independently; phase1_pass and
# phase2_pass are each 0 or 1 at this point.
# NOTE(review): because the phases are independent, a test that passes both
# before and after the fix (Phase 1 fails, Phase 2 passes) still earns 0.5 —
# confirm this is the intended reward for a test that never fails.
case "${phase1_pass}${phase2_pass}" in
  11)    score="1.0" ;;
  10|01) score="0.5" ;;
  *)     score="0.0" ;;
esac

write_score "$score"