Skip to content

Commit a83a435

Browse files
committed
benchmarks: migrate onboarding repoqa verifiers
1 parent 8d10cc1 commit a83a435

File tree

9 files changed

+823
-87
lines changed

9 files changed

+823
-87
lines changed

benchmarks/csb_org_onboarding/ccx-onboard-search-207/tests/test.sh

Lines changed: 100 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,48 @@ TASK_OUTPUT="${TASK_OUTPUT:-$TASK_WORKDIR/solution.json}"
1717
cd "$TASK_REPO_ROOT" || { echo "ERROR: Cannot cd to $TASK_REPO_ROOT"; exit 1; }
1818
mkdir -p /logs/verifier
1919

20+
# Write a non-scorable validation_result.json for a failure detected before
# the scoring script runs.
#
# Arguments:
#   $1  status        stored verbatim in "status"
#   $2  code          stored in "failure.code"
#   $3  message       stored in "failure.message"
#   $4  stage         stored in "failure.stage"
#   $5  primary_path  stored in "output_contract.primary_path"
#
# Output:
#   Overwrites /logs/verifier/validation_result.json with reward 0.0,
#   scorable=false and passed=false.
#
# The payload carries the same top-level keys as the results written by the
# scoring script ("details" and "legacy" included), so every
# validation_result.v1alpha1 document has a uniform shape.
write_validation_failure() {
    local status="$1"
    local code="$2"
    local message="$3"
    local stage="$4"
    local primary_path="$5"
    # Values travel as argv (not interpolated into the program text), and the
    # heredoc delimiter is quoted, so the shell never expands the Python code.
    python3 - "$status" "$code" "$message" "$stage" "$primary_path" <<'PYEOF'
import json
import sys

status, code, message, stage, primary_path = sys.argv[1:6]
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": status,
    "scorable": False,
    "scorer_family": "semantic_retrieval_qa",
    "reward": 0.0,
    "pass_threshold": 1.0,
    "passed": False,
    "output_contract": {
        "mode": "solution_json",
        "primary_path": primary_path,
        "required_artifact": True,
    },
    "sub_scores": {},
    "failure": {
        "code": code,
        "message": message,
        "stage": stage,
    },
    # No scoring happened, so there are no details to report; the key is kept
    # so consumers can index it unconditionally.
    "details": {},
    # Mirrors the {"score": 0.0} that callers write to reward.json.
    "legacy": {
        "reward_json": {"score": 0.0},
    },
}
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
}
55+
2056
if [ ! -f /tests/ground_truth.json ]; then
2157
echo "ERROR: No ground_truth.json found at /tests/ground_truth.json"
2258
echo '{"score": 0.0}' > /logs/verifier/reward.json
2359
echo "0.0" > /logs/verifier/reward.txt
60+
write_validation_failure "verifier_error" "missing_ground_truth" \
61+
"ground_truth.json not found at /tests/ground_truth.json" "verifier_runtime" "$TASK_OUTPUT"
2462
exit 0
2563
fi
2664

@@ -29,12 +67,14 @@ if [ ! -f "$SOLUTION_FILE" ]; then
2967
echo "ERROR: Agent did not create solution.json in /app/"
3068
echo '{"score": 0.0}' > /logs/verifier/reward.json
3169
echo "0.0" > /logs/verifier/reward.txt
70+
write_validation_failure "invalid_output" "missing_required_output" \
71+
"solution.json not found at $TASK_OUTPUT" "output_validation" "$TASK_OUTPUT"
3272
exit 0
3373
fi
3474

3575
VERIFY_SCRIPT="$(mktemp /logs/verifier/verify_XXXXXX.py)"
3676
cat > "$VERIFY_SCRIPT" << 'PYEOF'
37-
import json, os, sys, re
77+
import json, os, sys, re, traceback
3878
sys.path.insert(0, "/tests")
3979
from verifiers import SemanticRetrievalQAVerifier
4080
@@ -50,7 +90,34 @@ try:
5090
5191
verifier = SemanticRetrievalQAVerifier(ground_truth)
5292
result = verifier.verify(agent_output)
53-
reward = {"score": float(result.correct_function)}
93+
score = float(result.correct_function)
94+
reward = {"score": score}
95+
validation_result = {
96+
"schema_version": "validation_result.v1alpha1",
97+
"status": "scored",
98+
"scorable": True,
99+
"scorer_family": "semantic_retrieval_qa",
100+
"reward": score,
101+
"pass_threshold": 1.0,
102+
"passed": score >= 1.0,
103+
"output_contract": {
104+
"mode": "solution_json",
105+
"primary_path": os.environ.get("TASK_OUTPUT", "/app/solution.json"),
106+
"required_artifact": True,
107+
},
108+
"sub_scores": {
109+
"correct_function": float(result.correct_function),
110+
"correct_path": float(result.correct_path),
111+
"justification_score": float(result.justification_score),
112+
},
113+
"failure": None,
114+
"details": {
115+
"reasoning": result.reasoning,
116+
},
117+
"legacy": {
118+
"reward_json": reward,
119+
},
120+
}
54121
55122
print(f"Correct Function: {result.correct_function:.2f}")
56123
print(f"Correct Path: {result.correct_path:.2f}")
@@ -59,18 +126,47 @@ try:
59126
60127
with open("/logs/verifier/reward.json", "w") as f:
61128
json.dump(reward, f, indent=2)
129+
with open("/logs/verifier/validation_result.json", "w") as f:
130+
json.dump(validation_result, f, indent=2)
62131
with open("/logs/verifier/reward.txt", "w") as f:
63132
f.write(str(reward["score"]))
64133
except Exception as e:
65-
import traceback
66134
print(f"ERROR: {e}")
67135
traceback.print_exc()
136+
validation_result = {
137+
"schema_version": "validation_result.v1alpha1",
138+
"status": "verifier_error",
139+
"scorable": False,
140+
"scorer_family": "semantic_retrieval_qa",
141+
"reward": 0.0,
142+
"pass_threshold": 1.0,
143+
"passed": False,
144+
"output_contract": {
145+
"mode": "solution_json",
146+
"primary_path": os.environ.get("TASK_OUTPUT", "/app/solution.json"),
147+
"required_artifact": True,
148+
},
149+
"sub_scores": {},
150+
"failure": {
151+
"code": "verifier_exception",
152+
"message": str(e),
153+
"stage": "scoring",
154+
},
155+
"details": {
156+
"traceback": traceback.format_exc(),
157+
},
158+
"legacy": {
159+
"reward_json": {"score": 0.0},
160+
},
161+
}
68162
with open("/logs/verifier/reward.json", "w") as f:
69163
json.dump({"score": 0.0}, f)
164+
with open("/logs/verifier/validation_result.json", "w") as f:
165+
json.dump(validation_result, f, indent=2)
70166
with open("/logs/verifier/reward.txt", "w") as f:
71167
f.write("0.0")
72168
PYEOF
73169

74170
python3 "$VERIFY_SCRIPT" 2>&1 | tee /logs/verifier/verify-debug.log
75-
exit 0
76171
rm -f "$VERIFY_SCRIPT"
172+
exit 0

benchmarks/csb_org_onboarding/ccx-onboard-search-208/tests/test.sh

Lines changed: 100 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,48 @@ TASK_OUTPUT="${TASK_OUTPUT:-$TASK_WORKDIR/solution.json}"
1717
cd "$TASK_REPO_ROOT" || { echo "ERROR: Cannot cd to $TASK_REPO_ROOT"; exit 1; }
1818
mkdir -p /logs/verifier
1919

20+
# Write a non-scorable validation_result.json for a failure detected before
# the scoring script runs.
#
# Arguments:
#   $1  status        stored verbatim in "status"
#   $2  code          stored in "failure.code"
#   $3  message       stored in "failure.message"
#   $4  stage         stored in "failure.stage"
#   $5  primary_path  stored in "output_contract.primary_path"
#
# Output:
#   Overwrites /logs/verifier/validation_result.json with reward 0.0,
#   scorable=false and passed=false.
#
# The payload carries the same top-level keys as the results written by the
# scoring script ("details" and "legacy" included), so every
# validation_result.v1alpha1 document has a uniform shape.
write_validation_failure() {
    local status="$1"
    local code="$2"
    local message="$3"
    local stage="$4"
    local primary_path="$5"
    # Values travel as argv (not interpolated into the program text), and the
    # heredoc delimiter is quoted, so the shell never expands the Python code.
    python3 - "$status" "$code" "$message" "$stage" "$primary_path" <<'PYEOF'
import json
import sys

status, code, message, stage, primary_path = sys.argv[1:6]
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": status,
    "scorable": False,
    "scorer_family": "semantic_retrieval_qa",
    "reward": 0.0,
    "pass_threshold": 1.0,
    "passed": False,
    "output_contract": {
        "mode": "solution_json",
        "primary_path": primary_path,
        "required_artifact": True,
    },
    "sub_scores": {},
    "failure": {
        "code": code,
        "message": message,
        "stage": stage,
    },
    # No scoring happened, so there are no details to report; the key is kept
    # so consumers can index it unconditionally.
    "details": {},
    # Mirrors the {"score": 0.0} that callers write to reward.json.
    "legacy": {
        "reward_json": {"score": 0.0},
    },
}
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
}
55+
2056
if [ ! -f /tests/ground_truth.json ]; then
2157
echo "ERROR: No ground_truth.json found at /tests/ground_truth.json"
2258
echo '{"score": 0.0}' > /logs/verifier/reward.json
2359
echo "0.0" > /logs/verifier/reward.txt
60+
write_validation_failure "verifier_error" "missing_ground_truth" \
61+
"ground_truth.json not found at /tests/ground_truth.json" "verifier_runtime" "$TASK_OUTPUT"
2462
exit 0
2563
fi
2664

@@ -29,12 +67,14 @@ if [ ! -f "$SOLUTION_FILE" ]; then
2967
echo "ERROR: Agent did not create solution.json in /app/"
3068
echo '{"score": 0.0}' > /logs/verifier/reward.json
3169
echo "0.0" > /logs/verifier/reward.txt
70+
write_validation_failure "invalid_output" "missing_required_output" \
71+
"solution.json not found at $TASK_OUTPUT" "output_validation" "$TASK_OUTPUT"
3272
exit 0
3373
fi
3474

3575
VERIFY_SCRIPT="$(mktemp /logs/verifier/verify_XXXXXX.py)"
3676
cat > "$VERIFY_SCRIPT" << 'PYEOF'
37-
import json, os, sys, re
77+
import json, os, sys, re, traceback
3878
sys.path.insert(0, "/tests")
3979
from verifiers import SemanticRetrievalQAVerifier
4080
@@ -50,7 +90,34 @@ try:
5090
5191
verifier = SemanticRetrievalQAVerifier(ground_truth)
5292
result = verifier.verify(agent_output)
53-
reward = {"score": float(result.correct_function)}
93+
score = float(result.correct_function)
94+
reward = {"score": score}
95+
validation_result = {
96+
"schema_version": "validation_result.v1alpha1",
97+
"status": "scored",
98+
"scorable": True,
99+
"scorer_family": "semantic_retrieval_qa",
100+
"reward": score,
101+
"pass_threshold": 1.0,
102+
"passed": score >= 1.0,
103+
"output_contract": {
104+
"mode": "solution_json",
105+
"primary_path": os.environ.get("TASK_OUTPUT", "/app/solution.json"),
106+
"required_artifact": True,
107+
},
108+
"sub_scores": {
109+
"correct_function": float(result.correct_function),
110+
"correct_path": float(result.correct_path),
111+
"justification_score": float(result.justification_score),
112+
},
113+
"failure": None,
114+
"details": {
115+
"reasoning": result.reasoning,
116+
},
117+
"legacy": {
118+
"reward_json": reward,
119+
},
120+
}
54121
55122
print(f"Correct Function: {result.correct_function:.2f}")
56123
print(f"Correct Path: {result.correct_path:.2f}")
@@ -59,18 +126,47 @@ try:
59126
60127
with open("/logs/verifier/reward.json", "w") as f:
61128
json.dump(reward, f, indent=2)
129+
with open("/logs/verifier/validation_result.json", "w") as f:
130+
json.dump(validation_result, f, indent=2)
62131
with open("/logs/verifier/reward.txt", "w") as f:
63132
f.write(str(reward["score"]))
64133
except Exception as e:
65-
import traceback
66134
print(f"ERROR: {e}")
67135
traceback.print_exc()
136+
validation_result = {
137+
"schema_version": "validation_result.v1alpha1",
138+
"status": "verifier_error",
139+
"scorable": False,
140+
"scorer_family": "semantic_retrieval_qa",
141+
"reward": 0.0,
142+
"pass_threshold": 1.0,
143+
"passed": False,
144+
"output_contract": {
145+
"mode": "solution_json",
146+
"primary_path": os.environ.get("TASK_OUTPUT", "/app/solution.json"),
147+
"required_artifact": True,
148+
},
149+
"sub_scores": {},
150+
"failure": {
151+
"code": "verifier_exception",
152+
"message": str(e),
153+
"stage": "scoring",
154+
},
155+
"details": {
156+
"traceback": traceback.format_exc(),
157+
},
158+
"legacy": {
159+
"reward_json": {"score": 0.0},
160+
},
161+
}
68162
with open("/logs/verifier/reward.json", "w") as f:
69163
json.dump({"score": 0.0}, f)
164+
with open("/logs/verifier/validation_result.json", "w") as f:
165+
json.dump(validation_result, f, indent=2)
70166
with open("/logs/verifier/reward.txt", "w") as f:
71167
f.write("0.0")
72168
PYEOF
73169

74170
python3 "$VERIFY_SCRIPT" 2>&1 | tee /logs/verifier/verify-debug.log
75-
exit 0
76171
rm -f "$VERIFY_SCRIPT"
172+
exit 0

0 commit comments

Comments
 (0)