@@ -17,10 +17,48 @@ TASK_OUTPUT="${TASK_OUTPUT:-$TASK_WORKDIR/solution.json}"
# Work from the task repository root; abort with a non-zero status if it is
# unavailable. The variable is quoted without padding so the exact path is used.
cd "$TASK_REPO_ROOT" || { echo "ERROR: Cannot cd to $TASK_REPO_ROOT"; exit 1; }
# Directory that receives the verifier's output files.
mkdir -p /logs/verifier
1919
# write_validation_failure STATUS CODE MESSAGE STAGE PRIMARY_PATH
#
# Write /logs/verifier/validation_result.json for a run that could not be
# scored: reward 0.0, passed false, scorable false, empty sub_scores.
#
#   STATUS        top-level "status" value (e.g. "verifier_error")
#   CODE          failure.code
#   MESSAGE       failure.message
#   STAGE         failure.stage
#   PRIMARY_PATH  output_contract.primary_path
#
# The five values are handed to Python as argv rather than interpolated into
# the Python source, so quotes or backslashes in MESSAGE cannot break the
# script. The heredoc delimiter is quoted (no shell expansion inside) and must
# match the closing PYEOF line exactly, which has to start at column 0.
write_validation_failure() {
    local status="$1"
    local code="$2"
    local message="$3"
    local stage="$4"
    local primary_path="$5"
    python3 - "$status" "$code" "$message" "$stage" "$primary_path" <<'PYEOF'
import json
import sys

status, code, message, stage, primary_path = sys.argv[1:6]
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": status,
    "scorable": False,
    "scorer_family": "semantic_retrieval_qa",
    "reward": 0.0,
    "pass_threshold": 1.0,
    "passed": False,
    "output_contract": {
        "mode": "solution_json",
        "primary_path": primary_path,
        "required_artifact": True,
    },
    "sub_scores": {},
    "failure": {
        "code": code,
        "message": message,
        "stage": stage,
    },
    # "details" and "legacy" mirror the keys emitted by the scored and
    # exception payloads of the verifier script, so every
    # validation_result.v1alpha1 document carries the same top-level keys.
    "details": {},
    "legacy": {
        "reward_json": {"score": 0.0},
    },
}
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
}
55+
# Without ground truth nothing can be scored. Emit a zero reward in both
# legacy formats (reward.json, reward.txt) plus a structured failure record,
# then stop. The failure is classified as a verifier error at the
# verifier_runtime stage, since the missing file lives under /tests rather
# than in the agent's output.
# NOTE(review): exiting 0 here presumably lets the harness read the reward
# files instead of treating the run as a crash; confirm against the harness.
if [ ! -f /tests/ground_truth.json ]; then
    echo "ERROR: No ground_truth.json found at /tests/ground_truth.json"
    echo '{"score": 0.0}' > /logs/verifier/reward.json
    echo "0.0" > /logs/verifier/reward.txt
    write_validation_failure "verifier_error" "missing_ground_truth" \
        "ground_truth.json not found at /tests/ground_truth.json" "verifier_runtime" "$TASK_OUTPUT"
    exit 0
fi
2664
@@ -29,12 +67,14 @@ if [ ! -f "$SOLUTION_FILE" ]; then
2967 echo " ERROR: Agent did not create solution.json in /app/"
3068 echo ' {"score": 0.0}' > /logs/verifier/reward.json
3169 echo " 0.0" > /logs/verifier/reward.txt
70+ write_validation_failure " invalid_output" " missing_required_output" \
71+ " solution.json not found at $TASK_OUTPUT " " output_validation" " $TASK_OUTPUT "
3272 exit 0
3373fi
3474
3575VERIFY_SCRIPT=" $( mktemp /logs/verifier/verify_XXXXXX.py) "
3676cat > " $VERIFY_SCRIPT " << 'PYEOF '
37- import json, os, sys, re
77+ import json, os, sys, re, traceback
3878sys.path.insert(0, "/tests")
3979from verifiers import SemanticRetrievalQAVerifier
4080
5090
5191 verifier = SemanticRetrievalQAVerifier(ground_truth)
5292 result = verifier.verify(agent_output)
53- reward = {"score": float(result.correct_function)}
93+ score = float(result.correct_function)
94+ reward = {"score": score}
95+ validation_result = {
96+ "schema_version": "validation_result.v1alpha1",
97+ "status": "scored",
98+ "scorable": True,
99+ "scorer_family": "semantic_retrieval_qa",
100+ "reward": score,
101+ "pass_threshold": 1.0,
102+ "passed": score >= 1.0,
103+ "output_contract": {
104+ "mode": "solution_json",
105+ "primary_path": os.environ.get("TASK_OUTPUT", "/app/solution.json"),
106+ "required_artifact": True,
107+ },
108+ "sub_scores": {
109+ "correct_function": float(result.correct_function),
110+ "correct_path": float(result.correct_path),
111+ "justification_score": float(result.justification_score),
112+ },
113+ "failure": None,
114+ "details": {
115+ "reasoning": result.reasoning,
116+ },
117+ "legacy": {
118+ "reward_json": reward,
119+ },
120+ }
54121
55122 print(f"Correct Function: {result.correct_function:.2f}")
56123 print(f"Correct Path: {result.correct_path:.2f}")
@@ -59,18 +126,47 @@ try:
59126
60127 with open("/logs/verifier/reward.json", "w") as f:
61128 json.dump(reward, f, indent=2)
129+ with open("/logs/verifier/validation_result.json", "w") as f:
130+ json.dump(validation_result, f, indent=2)
62131 with open("/logs/verifier/reward.txt", "w") as f:
63132 f.write(str(reward["score"]))
64133except Exception as e:
65- import traceback
66134 print(f"ERROR: {e}")
67135 traceback.print_exc()
136+ validation_result = {
137+ "schema_version": "validation_result.v1alpha1",
138+ "status": "verifier_error",
139+ "scorable": False,
140+ "scorer_family": "semantic_retrieval_qa",
141+ "reward": 0.0,
142+ "pass_threshold": 1.0,
143+ "passed": False,
144+ "output_contract": {
145+ "mode": "solution_json",
146+ "primary_path": os.environ.get("TASK_OUTPUT", "/app/solution.json"),
147+ "required_artifact": True,
148+ },
149+ "sub_scores": {},
150+ "failure": {
151+ "code": "verifier_exception",
152+ "message": str(e),
153+ "stage": "scoring",
154+ },
155+ "details": {
156+ "traceback": traceback.format_exc(),
157+ },
158+ "legacy": {
159+ "reward_json": {"score": 0.0},
160+ },
161+ }
68162 with open("/logs/verifier/reward.json", "w") as f:
69163 json.dump({"score": 0.0}, f)
164+ with open("/logs/verifier/validation_result.json", "w") as f:
165+ json.dump(validation_result, f, indent=2)
70166 with open("/logs/verifier/reward.txt", "w") as f:
71167 f.write("0.0")
72168PYEOF
73169
# Run the generated verifier script, mirroring its combined stdout/stderr to
# the console and to a debug log.
python3 "$VERIFY_SCRIPT" 2>&1 | tee /logs/verifier/verify-debug.log
# Remove the temporary script before exiting; placed after `exit 0` this
# cleanup would be unreachable and the temp file would be left behind.
rm -f "$VERIFY_SCRIPT"
# The outcome is reported through the files under /logs/verifier, so the
# shell exit status is always 0.
exit 0
0 commit comments