Skip to content

Commit 6542883

Browse files
LoCoBench Botclaude
andcommitted
feat: US-005 - Create F1 scorer template for structured JSON output
Add benchmarks/templates/f1_json_scorer.sh — a reusable scorer for tasks that output structured JSON. Computes precision, recall, and F1 by matching entries on configurable composite key fields from ground_truth.json. Handles edge cases: empty output, missing ground truth, malformed JSON, and markdown-fenced JSON. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 111b5fe commit 6542883

1 file changed

Lines changed: 186 additions & 0 deletions

File tree

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
#!/bin/bash
2+
# Reward: F1 (0.0-1.0) — precision/recall scoring of structured JSON output
3+
#
4+
# SHARED F1 SCORER TEMPLATE
5+
# --------------------------
6+
# For tasks where the agent outputs structured JSON (e.g., callers.json,
7+
# implementors.json, symbols.json) that must be evaluated against a ground
8+
# truth set. Computes precision, recall, and F1 score by matching entries
9+
# on a configurable composite key.
10+
#
11+
# To use: copy to your task's tests/test.sh and set OUTPUT_PATH to the
12+
# expected agent output path. Provide a ground_truth.json in the same
13+
# tests/ directory.
14+
#
15+
# ground_truth.json schema:
16+
# {
17+
# "key_fields": ["repo", "file", "function"],
18+
# "entries": [
19+
# {"repo": "org/repo", "file": "src/foo.ts", "function": "handleSearch"},
20+
# ...
21+
# ]
22+
# }
23+
#
24+
# Agent output schema (at OUTPUT_PATH):
25+
# [
26+
# {"repo": "org/repo", "file": "src/foo.ts", "function": "handleSearch"},
27+
# ...
28+
# ]
29+
#
30+
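# Matching: Two entries match when all key_fields have equal values (case-sensitive) after each value is converted to a string and surrounding whitespace is trimmed; a missing field compares as the empty string.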
31+
# Each ground truth entry can match at most one reported entry (no double-counting).
32+
#
33+
# Metrics:
34+
# precision = true_positives / total_reported
35+
# recall = true_positives / total_expected
36+
# F1 = 2 * precision * recall / (precision + recall)
37+
38+
# Strict mode: abort on a failing command (-e), on use of an unset
# variable (-u), and on a failure in any stage of a pipeline (pipefail).
set -euo pipefail

# ── Configurable paths ────────────────────────────────────────────────────
# Override OUTPUT_PATH per task:
# crossrepo callers: /workspace/callers.json
# crossrepo symbols: /workspace/symbols.json
# generic: /workspace/output.json
# All three paths take their value from the environment when set, and fall
# back to the defaults below otherwise.
OUTPUT_PATH="${OUTPUT_PATH:-/workspace/output.json}"
GROUND_TRUTH="${GROUND_TRUTH:-/tests/ground_truth.json}"
REWARD_FILE="${REWARD_FILE:-/logs/verifier/reward.txt}"

# Derive the directory from REWARD_FILE so the directory that gets created is
# always the one the reward is written into.
mkdir -p -- "$(dirname -- "$REWARD_FILE")"

# ── Check prerequisites ───────────────────────────────────────────────────
# A missing input is recorded as a zero reward and the script exits 0, so a
# reward file exists on every path through this section.
if [[ ! -f "$GROUND_TRUTH" ]]; then
  echo "ERROR: ground_truth.json not found at $GROUND_TRUTH"
  echo "0.0" > "$REWARD_FILE"
  exit 0
fi

if [[ ! -f "$OUTPUT_PATH" ]]; then
  echo "No agent output found at $OUTPUT_PATH"
  echo "Agent did not produce the required output."
  echo "0.0" > "$REWARD_FILE"
  exit 0
fi

echo "Scoring agent output..."
echo "Output: $OUTPUT_PATH"
echo "Ground truth: $GROUND_TRUTH"
echo ""
69+
70+
# ── Delegate scoring to Python ────────────────────────────────────────────
# The three paths are passed to the interpreter as environment variables
# scoped to this one command. The heredoc delimiter is quoted, so the shell
# performs no expansion inside the Python program that follows.
REWARD_FILE="$REWARD_FILE" GROUND_TRUTH="$GROUND_TRUTH" OUTPUT_PATH="$OUTPUT_PATH" \
  python3 << 'PYEOF'
import json
import os
import re
import sys

# Paths handed over by the surrounding shell script through the environment.
# A missing variable raises KeyError here, before any scoring starts.
OUTPUT_PATH, GT_PATH, REWARD_PATH = (
    os.environ[var] for var in ("OUTPUT_PATH", "GROUND_TRUTH", "REWARD_FILE")
)
78+
79+
def write_reward(score):
    """Persist ``score`` to the reward file and echo it to stdout.

    The score is formatted with two decimal places; the reward file at
    REWARD_PATH is overwritten with that value followed by a newline.
    """
    formatted = f"{score:.2f}"
    with open(REWARD_PATH, "w") as reward_file:
        reward_file.write(formatted + "\n")
    print("\nTests completed - Score: " + formatted)
84+
85+
def strip_code_fences(text):
    """Return the payload of the first markdown code fence in ``text``.

    The opening fence may carry any language tag in any letter case
    (```json, ```JSON, or none) and must be followed by a newline. If the
    closing fence is missing (for example, truncated output), everything after
    the opening fence is used. When no fence is present the input is
    returned unchanged apart from trimming.

    Args:
        text: Raw file contents produced by the agent.

    Returns:
        The fenced content, or the whole text, with surrounding whitespace
        stripped.
    """
    # ``.*?`` is lazy, so a present closing fence always wins over ``\Z``;
    # ``\Z`` only applies when the fence is never closed.
    fence = re.search(r'```[ \t]*\w*\s*\n(.*?)(?:```|\Z)', text, re.DOTALL)
    return fence.group(1).strip() if fence else text.strip()
89+
90+
# ── Load ground truth ────────────────────────────────────────────────────
# Every problem with the ground truth file is reported the same way: print an
# ERROR line, write a 0.0 reward, and exit 0. This keeps an unreadable or
# malformed file from ending in a traceback with no reward file written.
try:
    with open(GT_PATH, encoding="utf-8") as f:
        gt = json.load(f)
except (OSError, ValueError) as e:
    # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
    print(f"ERROR: could not load ground_truth.json: {e}")
    write_reward(0.0)
    sys.exit(0)

if not isinstance(gt, dict):
    print("ERROR: ground_truth.json must be a JSON object with 'key_fields' and 'entries'")
    write_reward(0.0)
    sys.exit(0)

key_fields = gt.get("key_fields", [])
expected = gt.get("entries", [])

# A bare string would otherwise be iterated character by character as if each
# character were a field name, so require an actual non-empty list.
if not isinstance(key_fields, list) or not key_fields:
    print("ERROR: ground_truth.json must specify 'key_fields' (list of field names)")
    write_reward(0.0)
    sys.exit(0)

if not isinstance(expected, list):
    print("ERROR: ground_truth.json 'entries' must be a list")
    write_reward(0.0)
    sys.exit(0)

if not expected:
    print("ERROR: ground_truth.json has no entries")
    write_reward(0.0)
    sys.exit(0)

num_expected = len(expected)
108+
109+
# ── Load agent output ────────────────────────────────────────────────────
# Any failure to read or parse the agent's file is scored as an empty result
# rather than aborting the scorer.
try:
    # utf-8-sig decodes plain UTF-8 and also drops a leading byte-order mark,
    # which json.loads would otherwise reject.
    with open(OUTPUT_PATH, encoding="utf-8-sig") as f:
        raw = f.read()
    reported = json.loads(strip_code_fences(raw))
    if not isinstance(reported, list):
        print("Agent output is not a JSON array — scoring as empty.")
        reported = []
except OSError as e:
    print(f"Could not read agent output: {e}")
    reported = []
except ValueError as e:
    # Covers json.JSONDecodeError and UnicodeDecodeError (both ValueError).
    print(f"Malformed JSON in agent output: {e}")
    reported = []

num_reported = len(reported)

# Nothing reported means precision and recall are both zero; stop early.
if num_reported == 0:
    print("Agent output is empty — no entries to score.")
    print(f"Expected {num_expected} entries.")
    write_reward(0.0)
    sys.exit(0)
129+
130+
# ── Build composite keys ─────────────────────────────────────────────────
131+
def make_key(entry, fields):
    """Build a composite key tuple from an entry's field values.

    Each value is converted with ``str`` and stripped of surrounding
    whitespace; a field absent from ``entry`` contributes the empty string.

    Args:
        entry: One element of the ground truth or agent output list.
            Anything that is not a dict (a bare string, number, list or
            null) is treated as an entry with no fields instead of raising.
        fields: Field names that make up the key, in order.

    Returns:
        A tuple of strings with one item per name in ``fields``.
    """
    if not isinstance(entry, dict):
        # Malformed element: give it an all-empty key so it still counts as a
        # reported entry but does not crash the scorer.
        entry = {}
    return tuple(str(entry.get(f, "")).strip() for f in fields)
134+
135+
expected_keys = [make_key(e, key_fields) for e in expected]
reported_keys = [make_key(r, key_fields) for r in reported]

# ── Match reported against expected (one match per expected entry) ────────
# Index the expected entries by key so each reported entry is resolved with a
# single lookup instead of a scan over every expected entry. Indices are
# stored in descending order so that pop() hands out the lowest unmatched
# index first, the same pairing a front-to-back scan would produce.
unmatched_by_key = {}
for e_idx in range(len(expected_keys) - 1, -1, -1):
    unmatched_by_key.setdefault(expected_keys[e_idx], []).append(e_idx)

matched_expected = set()
for r_key in reported_keys:
    candidates = unmatched_by_key.get(r_key)
    if candidates:
        # Consuming the index guarantees an expected entry is matched by at
        # most one reported entry; duplicate reports become false positives.
        matched_expected.add(candidates.pop())

true_positives = len(matched_expected)
150+
151+
# ── Compute metrics ──────────────────────────────────────────────────────
# precision = matched / reported, recall = matched / expected; either one is
# 0.0 when its denominator is zero.
precision = 0.0
if num_reported > 0:
    precision = true_positives / num_reported

recall = 0.0
if num_expected > 0:
    recall = true_positives / num_expected

# F1 is the harmonic mean of precision and recall, defined as 0.0 when both
# are zero so the division is never attempted.
pr_sum = precision + recall
f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0.0
159+
160+
# ── Print detailed results ───────────────────────────────────────────────
def _entry_view(idx):
    """Pair each key field name with the expected entry's value at ``idx``."""
    return dict(zip(key_fields, expected_keys[idx]))


summary_lines = (
    "=== F1 Scoring ===",
    f" Key fields: {key_fields}",
    f" Expected entries: {num_expected}",
    f" Reported entries: {num_reported}",
    f" True positives: {true_positives}",
    f" False positives: {num_reported - true_positives}",
    f" False negatives: {num_expected - true_positives}",
    "",
    f" Precision: {precision:.3f}",
    f" Recall: {recall:.3f}",
    f" F1: {f1:.3f}",
)
for summary_line in summary_lines:
    print(summary_line)

# ── Show matched and missed entries ──────────────────────────────────────
# Matched entries are listed in ground-truth order, followed by the expected
# entries that no reported entry matched.
if true_positives > 0:
    print(f"\n=== Matched ({true_positives}) ===")
    for hit in sorted(matched_expected):
        print(f" [x] {_entry_view(hit)}")

missed = [i for i in range(num_expected) if i not in matched_expected]
if missed:
    print(f"\n=== Missed ({len(missed)}) ===")
    for miss in missed:
        print(f" [ ] {_entry_view(miss)}")

# The final F1 is what gets recorded as the task reward.
write_reward(f1)
186+
PYEOF

0 commit comments

Comments
 (0)