#!/bin/bash
# Reward: F1 (0.0-1.0) — precision/recall scoring of structured JSON output
#
# SHARED F1 SCORER TEMPLATE
# --------------------------
# For tasks where the agent outputs structured JSON (e.g., callers.json,
# implementors.json, symbols.json) that must be evaluated against a ground
# truth set. Computes precision, recall, and F1 score by matching entries
# on a configurable composite key.
#
# To use: copy to your task's tests/test.sh and set OUTPUT_PATH to the
# expected agent output path. Provide a ground_truth.json in the same
# tests/ directory.
#
# ground_truth.json schema:
#   {
#     "key_fields": ["repo", "file", "function"],
#     "entries": [
#       {"repo": "org/repo", "file": "src/foo.ts", "function": "handleSearch"},
#       ...
#     ]
#   }
#
# Agent output schema (at OUTPUT_PATH):
#   [
#     {"repo": "org/repo", "file": "src/foo.ts", "function": "handleSearch"},
#     ...
#   ]
#
# Matching: Two entries match when all key_fields have equal values
# (case-sensitive; values are compared as whitespace-trimmed strings).
# Each ground truth entry can match at most one reported entry (no double-counting).
#
# Metrics:
#   precision = true_positives / total_reported
#   recall    = true_positives / total_expected
#   F1        = 2 * precision * recall / (precision + recall)

set -e

# ── Configurable paths ────────────────────────────────────────────────────
# OUTPUT_PATH and GROUND_TRUTH keep any non-empty value inherited from the
# environment; otherwise they fall back to the defaults below.
# Override OUTPUT_PATH per task:
#   crossrepo callers: /workspace/callers.json
#   crossrepo symbols: /workspace/symbols.json
#   generic:           /workspace/output.json
: "${OUTPUT_PATH:=/workspace/output.json}"
: "${GROUND_TRUTH:=/tests/ground_truth.json}"
REWARD_FILE="/logs/verifier/reward.txt"

# Make sure the directory that will hold the reward file exists.
mkdir -p "${REWARD_FILE%/*}"

# ── Check prerequisites ───────────────────────────────────────────────────
# A missing ground truth or missing agent output is scored as 0.0 and the
# script still exits 0.

# Print each argument on its own line, record a 0.0 reward, and stop.
score_zero_and_exit() {
  printf '%s\n' "$@"
  echo "0.0" > "$REWARD_FILE"
  exit 0
}

if [[ ! -f "$GROUND_TRUTH" ]]; then
  score_zero_and_exit "ERROR: ground_truth.json not found at $GROUND_TRUTH"
fi

if [[ ! -f "$OUTPUT_PATH" ]]; then
  score_zero_and_exit \
    "No agent output found at $OUTPUT_PATH" \
    "Agent did not produce the required output."
fi

# Banner: say what is being scored, followed by one blank line.
printf '%s\n' \
  "Scoring agent output..." \
  "Output: $OUTPUT_PATH" \
  "Ground truth: $GROUND_TRUTH" \
  ""

# ── Delegate scoring to Python ────────────────────────────────────────────
# The three paths are passed through the environment of this one command.
# The heredoc delimiter is quoted, so the shell performs no expansion inside
# the Python program that follows.
OUTPUT_PATH="$OUTPUT_PATH" GROUND_TRUTH="$GROUND_TRUTH" REWARD_FILE="$REWARD_FILE" \
python3 << 'PYEOF'
import json
import os
import re
import sys

# Paths are handed over by the shell wrapper through the environment.
# A missing variable raises KeyError here.
OUTPUT_PATH, GT_PATH, REWARD_PATH = (
    os.environ[name] for name in ("OUTPUT_PATH", "GROUND_TRUTH", "REWARD_FILE")
)

def write_reward(score):
    """Record ``score`` in the reward file and echo it on stdout.

    The score is formatted with two decimal places. The reward file at
    REWARD_PATH is overwritten and ends with a newline.
    """
    formatted = f"{score:.2f}"
    with open(REWARD_PATH, "w") as reward_file:
        print(formatted, file=reward_file)
    print(f"\nTests completed - Score: {formatted}")

def strip_code_fences(text):
    """Return the payload of the first Markdown code fence in ``text``.

    Agents sometimes wrap their JSON in a fenced block. The opening fence
    may be bare or tagged ``json``; the tag is matched case-insensitively so
    that ``JSON`` or ``Json`` is unwrapped as well.

    Args:
        text: Raw contents of the agent output file.

    Returns:
        The stripped text between the first opening fence and the next
        closing fence, or the stripped ``text`` itself when no fence is found.
    """
    fenced = re.search(
        r'```(?:json)?\s*\n(.*?)```',
        text,
        re.DOTALL | re.IGNORECASE,
    )
    return fenced.group(1).strip() if fenced else text.strip()

# ── Load ground truth ────────────────────────────────────────────────────
# Every failure in this section is scored 0.0 with exit status 0, so a reward
# file is always written even when the ground truth itself is broken.
try:
    with open(GT_PATH, encoding="utf-8") as f:
        gt = json.load(f)
except (OSError, ValueError) as e:
    # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
    print(f"ERROR: could not load ground truth from {GT_PATH}: {e}")
    write_reward(0.0)
    sys.exit(0)

if not isinstance(gt, dict):
    print("ERROR: ground_truth.json must be a JSON object with 'key_fields' and 'entries'")
    write_reward(0.0)
    sys.exit(0)

key_fields = gt.get("key_fields", [])
expected = gt.get("entries", [])

# A non-list value (e.g. a bare string) would be iterated character by
# character later on, so it is rejected here together with the empty case.
if not isinstance(key_fields, list) or not key_fields:
    print("ERROR: ground_truth.json must specify 'key_fields' (list of field names)")
    write_reward(0.0)
    sys.exit(0)

if not isinstance(expected, list) or not expected:
    print("ERROR: ground_truth.json has no entries")
    write_reward(0.0)
    sys.exit(0)

num_expected = len(expected)

# ── Load agent output ────────────────────────────────────────────────────
# The agent's file is untrusted: any way it can fail to yield a JSON array is
# treated as an empty report (score 0.0) instead of crashing the scorer.
try:
    # utf-8-sig decodes as UTF-8 regardless of the container locale and drops
    # a leading byte-order mark, which json.loads would otherwise reject.
    with open(OUTPUT_PATH, encoding="utf-8-sig") as f:
        raw = f.read()
    reported = json.loads(strip_code_fences(raw))
    if not isinstance(reported, list):
        print("Agent output is not a JSON array — scoring as empty.")
        reported = []
except OSError as e:
    print(f"Could not read agent output: {e}")
    reported = []
except (ValueError, RecursionError) as e:
    # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
    # RecursionError is raised for pathologically nested JSON.
    print(f"Malformed JSON in agent output: {e}")
    reported = []

num_reported = len(reported)

if num_reported == 0:
    print("Agent output is empty — no entries to score.")
    print(f"Expected {num_expected} entries.")
    write_reward(0.0)
    sys.exit(0)

# ── Build composite keys ─────────────────────────────────────────────────
def make_key(entry, fields):
    """Build the composite key used to compare two entries.

    Args:
        entry: One record from the ground truth or from the agent output.
        fields: Ordered field names that make up the key.

    Returns:
        For a dict ``entry``: a tuple holding one whitespace-trimmed string
        per field, with "" for a missing field. For any other value: a
        one-element tuple holding a fresh ``object()``. That key is hashable
        and equal only to itself, so the entry still counts as reported but
        can never match anything.
    """
    if not isinstance(entry, dict):
        # Agent output is untrusted and may contain strings, numbers or
        # nested lists. Calling .get() on those would crash the scorer
        # before a reward is written.
        return (object(),)
    return tuple(str(entry.get(name, "")).strip() for name in fields)

expected_keys = [make_key(e, key_fields) for e in expected]
reported_keys = [make_key(r, key_fields) for r in reported]

# ── Match reported against expected (one match per expected entry) ────────
# Index the expected entries by key so each reported entry is resolved with a
# single lookup instead of a scan over every expected entry. Each list keeps
# the still-unmatched indices in ascending order, so a reported entry always
# consumes the lowest unmatched expected index with the same key. Surplus
# duplicates in the report find an empty list and count as false positives.
unmatched_by_key = {}
for e_idx, e_key in enumerate(expected_keys):
    unmatched_by_key.setdefault(e_key, []).append(e_idx)

matched_expected = set()
for r_key in reported_keys:
    candidates = unmatched_by_key.get(r_key)
    if candidates:
        matched_expected.add(candidates.pop(0))

true_positives = len(matched_expected)

# ── Compute metrics ──────────────────────────────────────────────────────
def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0.0

precision = _ratio(true_positives, num_reported)
recall = _ratio(true_positives, num_expected)

# F1 is the harmonic mean of precision and recall; it is defined as 0.0 when
# both are zero.
pr_sum = precision + recall
f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0.0

# ── Print detailed results ───────────────────────────────────────────────
# Counts first, then a blank line, then the three ratios to three decimals.
summary_lines = [
    "=== F1 Scoring ===",
    f" Key fields: {key_fields}",
    f" Expected entries: {num_expected}",
    f" Reported entries: {num_reported}",
    f" True positives: {true_positives}",
    f" False positives: {num_reported - true_positives}",
    f" False negatives: {num_expected - true_positives}",
    "",
    f" Precision: {precision:.3f}",
    f" Recall: {recall:.3f}",
    f" F1: {f1:.3f}",
]
print("\n".join(summary_lines))

# ── Show matched and missed entries ──────────────────────────────────────
# Both listings are in ground-truth order, and each entry is shown as a
# {field: value} dict rebuilt from its composite key.
def _as_row(index):
    """Rebuild the {field: value} view of the expected entry at ``index``."""
    return dict(zip(key_fields, expected_keys[index]))

if true_positives > 0:
    print(f"\n=== Matched ({true_positives}) ===")
    for hit in sorted(matched_expected):
        print(f" [x] {_as_row(hit)}")

missed = [idx for idx in range(num_expected) if idx not in matched_expected]
if missed:
    print(f"\n=== Missed ({len(missed)}) ===")
    for miss in missed:
        print(f" [ ] {_as_row(miss)}")

# Persist the final F1 as the task reward.
write_reward(f1)
PYEOF