Skip to content

Commit 6086039

Browse files
LoCoBench Bot and claude committed
feat: US-007 - Create 1 TypeScript task scaffolding
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 37edd96 commit 6086039

File tree

6 files changed

+168
-0
lines changed

6 files changed

+168
-0
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# TODO: Set up buggy codebase at correct commit with test dependencies
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<!-- TODO: Write symptom-only bug description. Do NOT leak file paths, function names, or fix strategies. -->
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
[task]
2+
name = "navprove-tutanota-search-001"
3+
difficulty = "hard"
4+
time_limit_sec = 1800
5+
language = "typescript"
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
#!/usr/bin/env bash
2+
# Reward: find_and_prove (0.0-1.0) — 2-phase verification with majority-of-3 voting
3+
#
4+
# Phase 1 (0.5): Agent's regression test FAILS on buggy code (>=2 of 3 runs fail)
5+
# Phase 2 (0.5): Agent's regression test PASSES after applying reference patch (>=2 of 3 runs pass)
6+
#
7+
# Environment variables (set by each task's test.sh before sourcing):
8+
# AGENT_TEST_PATH — path to agent-written test (default: /workspace/regression_test.py)
9+
# REFERENCE_PATCH — path to known-good patch (default: /tests/reference_fix.patch)
10+
# TEST_COMMAND — command to run the test (default: python3 -m pytest)
11+
# PATCH_APPLY_DIR — directory to apply patch in (default: /workspace)
12+
13+
set -euo pipefail
14+
15+
# --- Defaults ---
# A value already present in the environment wins; the built-in default is
# assigned only when the variable is unset or empty.
: "${AGENT_TEST_PATH:=/workspace/regression_test.py}"
: "${REFERENCE_PATCH:=/tests/reference_fix.patch}"
: "${TEST_COMMAND:=python3 -m pytest}"
: "${PATCH_APPLY_DIR:=/workspace}"
20+
21+
# --- Logging setup ---
# Every artifact the verifier produces lives under a single directory:
# one log per phase, a human-readable summary, and the reward file.
LOG_DIR="/logs/verifier"

PHASE1_LOG="${LOG_DIR}/phase1.log"
PHASE2_LOG="${LOG_DIR}/phase2.log"
SUMMARY_LOG="${LOG_DIR}/summary.log"
REWARD_FILE="${LOG_DIR}/reward.txt"

# Create the directory before anything is written into it.
mkdir -p "${LOG_DIR}"
29+
30+
# --- Helper: write final score and exit ---
# Persists the final score and terminates the script.
# Globals:   REWARD_FILE (overwritten), SUMMARY_LOG (appended to)
# Arguments: $1 - the score string to record
# Outputs:   the score on stdout
# Returns:   never returns; exits the shell with status 0
write_score() {
  local final_score="$1"

  echo "$final_score" > "$REWARD_FILE"
  {
    echo "=== FINAL SCORE: $final_score ==="
  } >> "$SUMMARY_LOG"

  echo "$final_score"
  exit 0
}
38+
39+
# --- Summary header ---
# Start a fresh summary log BEFORE any early-exit check. Writing the header
# first means the error paths below never append to a stale summary left by
# a previous run, and every summary records the effective configuration.
echo "=== Find and Prove Verifier ===" > "$SUMMARY_LOG"
echo "AGENT_TEST_PATH=$AGENT_TEST_PATH" >> "$SUMMARY_LOG"
echo "REFERENCE_PATCH=$REFERENCE_PATCH" >> "$SUMMARY_LOG"
echo "TEST_COMMAND=$TEST_COMMAND" >> "$SUMMARY_LOG"
echo "PATCH_APPLY_DIR=$PATCH_APPLY_DIR" >> "$SUMMARY_LOG"
echo "" >> "$SUMMARY_LOG"

# --- Edge case: agent test does not exist or is empty ---
# Either condition scores 0.0 immediately; write_score exits the script.
if [[ ! -f "$AGENT_TEST_PATH" ]]; then
  echo "ERROR: Agent test not found at $AGENT_TEST_PATH" >> "$SUMMARY_LOG"
  write_score "0.0"
fi

if [[ ! -s "$AGENT_TEST_PATH" ]]; then
  echo "ERROR: Agent test is empty at $AGENT_TEST_PATH" >> "$SUMMARY_LOG"
  write_score "0.0"
fi
56+
57+
# --- Phase 1: Test should FAIL on buggy code (>=2 of 3 fail) ---
# Runs the agent's test three times against the unpatched code and counts
# how many runs fail. Sets phase1_pass=1 when at least two runs failed.
#
# Exit statuses 125, 126 and 127 are reserved by timeout(1) for "timeout
# itself failed", "command found but not invocable" and "command not found".
# They mean the test never ran, so they are logged as errors and NOT counted
# as the test failing on buggy code.
echo "=== PHASE 1: Verify test fails on buggy code ===" >> "$SUMMARY_LOG"
echo "=== PHASE 1: Verify test fails on buggy code ===" > "$PHASE1_LOG"

phase1_failures=0
for run in 1 2 3; do
  echo "--- Phase 1, run $run ---" >> "$PHASE1_LOG"

  # Capture the status with `||` so errexit cannot abort the verifier.
  exit_code=0
  # shellcheck disable=SC2086 # TEST_COMMAND is deliberately word-split
  timeout 60 $TEST_COMMAND "$AGENT_TEST_PATH" >> "$PHASE1_LOG" 2>&1 || exit_code=$?

  case "$exit_code" in
    0)
      echo "Run $run: PASSED (exit 0) — unexpected" >> "$PHASE1_LOG"
      ;;
    125 | 126 | 127)
      echo "Run $run: ERROR (exit $exit_code) — test command could not be executed; not counted as a failure" >> "$PHASE1_LOG"
      ;;
    *)
      phase1_failures=$((phase1_failures + 1))
      echo "Run $run: FAILED (exit $exit_code) — expected" >> "$PHASE1_LOG"
      ;;
  esac
done

phase1_pass=0
if [[ $phase1_failures -ge 2 ]]; then
  phase1_pass=1
  echo "Phase 1 PASSED: $phase1_failures/3 runs failed (>=2 required)" >> "$SUMMARY_LOG"
else
  echo "Phase 1 FAILED: only $phase1_failures/3 runs failed (<2 required)" >> "$SUMMARY_LOG"
fi
84+
85+
# --- Edge case: no reference patch ---
# Without a patch file Phase 2 cannot run, so the final score is whatever
# Phase 1 earned. write_score exits the script.
if [[ ! -f "$REFERENCE_PATCH" ]]; then
  echo "WARNING: Reference patch not found at $REFERENCE_PATCH — skipping Phase 2" >> "$SUMMARY_LOG"
  case "$phase1_pass" in
    1) write_score "0.5" ;;
    *) write_score "0.0" ;;
  esac
fi
94+
95+
# --- Phase 2: Apply patch, test should PASS (>=2 of 3 pass) ---
echo "=== PHASE 2: Verify test passes after fix ===" >> "$SUMMARY_LOG"
echo "=== PHASE 2: Verify test passes after fix ===" > "$PHASE2_LOG"

# Apply the reference patch inside a subshell so the verifier's own working
# directory is left untouched. The status is captured with `||` so a failed
# apply is handled below instead of tripping errexit.
echo "Applying patch: git apply $REFERENCE_PATCH in $PATCH_APPLY_DIR" >> "$PHASE2_LOG"
apply_exit=0
(cd "$PATCH_APPLY_DIR" && git apply "$REFERENCE_PATCH") >> "$PHASE2_LOG" 2>&1 || apply_exit=$?

# If the patch cannot be applied, only the Phase 1 result can be scored.
if [[ $apply_exit -ne 0 ]]; then
  echo "ERROR: Patch application failed (exit $apply_exit)" >> "$PHASE2_LOG"
  echo "ERROR: Patch application failed — scoring Phase 1 only" >> "$SUMMARY_LOG"
  case "$phase1_pass" in
    1) write_score "0.5" ;;
    *) write_score "0.0" ;;
  esac
fi
115+
116+
# Run the agent's test three times against the patched code and count the
# runs that exit 0. phase2_pass becomes 1 when at least two runs passed.
phase2_passes=0
for run in 1 2 3; do
  echo "--- Phase 2, run $run ---" >> "$PHASE2_LOG"

  # Capture the status with `||` so errexit cannot abort the verifier.
  exit_code=0
  # shellcheck disable=SC2086 # TEST_COMMAND is deliberately word-split
  timeout 60 $TEST_COMMAND "$AGENT_TEST_PATH" >> "$PHASE2_LOG" 2>&1 || exit_code=$?

  if [[ $exit_code -ne 0 ]]; then
    echo "Run $run: FAILED (exit $exit_code) — unexpected" >> "$PHASE2_LOG"
    continue
  fi

  phase2_passes=$((phase2_passes + 1))
  echo "Run $run: PASSED (exit 0) — expected" >> "$PHASE2_LOG"
done

if [[ $phase2_passes -lt 2 ]]; then
  phase2_pass=0
  echo "Phase 2 FAILED: only $phase2_passes/3 runs passed (<2 required)" >> "$SUMMARY_LOG"
else
  phase2_pass=1
  echo "Phase 2 PASSED: $phase2_passes/3 runs passed (>=2 required)" >> "$SUMMARY_LOG"
fi
139+
140+
# --- Compute final score ---
# Each phase that passed contributes 0.5, so the score follows directly from
# how many of the two phase flags are set.
# NOTE(review): a test that passes both before and after the patch (Phase 1
# failed, Phase 2 passed) still earns 0.5 — confirm this is intended.
case "$((phase1_pass + phase2_pass))" in
  2) score="1.0" ;;
  1) score="0.5" ;;
  *) score="0.0" ;;
esac

write_score "$score"
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# TODO: Add the known-good patch (git diff format)
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/usr/bin/env bash
# Verifier for navprove-tutanota-search-001
# Sources the shared find_and_prove_verifier to run 2-phase majority-of-3 verification.
#
# The exported variables configure the shared verifier, which falls back to
# Python/pytest defaults when they are not set, so they must be defined
# before the `source` line.

export AGENT_TEST_PATH="/workspace/regression_test.test.ts"
# Jest's documented per-test timeout flag is --testTimeout (milliseconds);
# --timeout is not a documented Jest CLI option.
export TEST_COMMAND="npx jest --testTimeout=60000"
export REFERENCE_PATCH="/tests/reference_fix.patch"
export PATCH_APPLY_DIR="/workspace"

source /tests/find_and_prove_verifier.sh

0 commit comments

Comments
 (0)