Skip to content

Commit 343fe71

Browse files
committed
Add evaluation score extraction helpers
Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
1 parent ffa7558 commit 343fe71

8 files changed

Lines changed: 214 additions & 18 deletions

File tree

.claude/skills/evaluation/recipes/tasks/aa_lcr.md

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
- Task: `aa_lcr`
66
- Harness: AA-LCR, chat
7-
- Primary metric: `pass@1 judge_correct`
7+
- Primary metric: `accuracy.accuracy`
88
- Run time: Long
99
- Samples: 3
1010
- Requires: `HF_TOKEN`, `JUDGE_API_KEY`
@@ -45,5 +45,28 @@ Use this inside the top-level `evaluation.tasks` list:
4545
AA-LCR accuracy comes from:
4646
4747
```text
48-
results.groups.aa_lcr.metrics.pass@1.scores.judge_correct.value
48+
results.groups.aa_lcr.metrics.accuracy.scores.accuracy.value
49+
results.groups.aa_lcr.metrics.accuracy.scores.accuracy.stats.stderr
50+
```
51+
52+
```python
53+
import yaml
54+
55+
56+
def extract_aa_lcr_score(path):
    """Extract the AA-LCR accuracy and its stderr from a results YAML file.

    Reads ``results.groups.aa_lcr.metrics.accuracy.scores.accuracy`` and
    multiplies both ``value`` and the optional ``stats.stderr`` by 100
    (presumably converting fractions to percentage points; confirm against
    the results schema).

    Args:
        path: Path to the results YAML file.

    Returns:
        A dict with the keys ``group``, ``metric``, ``score_key``,
        ``accuracy`` (``value * 100``) and ``stderr`` (``stats.stderr * 100``,
        or ``None`` when no stderr is present).

    Raises:
        KeyError: If a key on the expected results path is missing.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being leaked until garbage collection.
    with open(path, encoding="utf-8") as handle:
        data = yaml.safe_load(handle)

    scores = data["results"]["groups"]["aa_lcr"]["metrics"]["accuracy"]["scores"]
    entry = scores["accuracy"]
    accuracy = entry["value"] * 100

    # ``stats`` / ``stderr`` are optional; keep ``None`` rather than failing.
    stderr = entry.get("stats", {}).get("stderr")
    stderr_pp = stderr * 100 if stderr is not None else None

    return {
        "group": "aa_lcr",
        "metric": "accuracy",
        "score_key": "accuracy",
        "accuracy": accuracy,
        "stderr": stderr_pp,
    }
71+
4972
```

.claude/skills/evaluation/recipes/tasks/aime2025.md

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,5 +33,61 @@ Use this inside the top-level `evaluation.tasks` list:
3333
3434
## Score Extraction
3535
36-
Prefer the `pass@1[avg-of-N]` metric matching the configured sample/repeat
37-
count.
36+
AIME accuracy (in percentage points) comes from:
37+
38+
```text
39+
results.groups.aime25.metrics."pass@1[avg-of-N]".scores.symbolic_correct.value
40+
```
41+
42+
For repeated runs, report stderr as percentage points:
43+
44+
```text
45+
results.groups.aime25.metrics."pass@1[avg-of-N]".scores.symbolic_correct_statistics_std_err_across_runs.value * 100
46+
```
47+
48+
Prefer the `pass@1[avg-of-N]` metric matching the configured repeat count.
49+
If the repeat count is unknown, use the highest available `avg-of-N`.
50+
51+
```python
52+
import re
53+
import yaml
54+
55+
56+
def avg_of(metric_name):
    """Return N from a ``pass@1[avg-of-N]`` metric name, or ``None``.

    The whole name must match the pattern; any other name (for example a
    plain ``pass@1``) yields ``None``.
    """
    found = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)
    if not found:
        return None
    return int(found.group(1))
59+
60+
61+
def select_metric(metrics, repeats=None):
    """Choose which ``pass@1`` metric name to read scores from.

    Preference order:
      1. ``pass@1[avg-of-{repeats}]`` when ``repeats`` is given and that
         name is present in ``metrics``.
      2. Otherwise the ``pass@1[avg-of-N]`` name with the largest N.
      3. Otherwise the plain ``"pass@1"`` (returned even if it is not a key
         of ``metrics``).
    """
    wanted = None if repeats is None else f"pass@1[avg-of-{repeats}]"
    if wanted is not None and wanted in metrics:
        return wanted

    # Track the largest avg-of-N seen; strict ``>`` keeps the first name on
    # ties, matching ``max`` semantics.
    best_name, best_n = "pass@1", None
    for name in metrics:
        n = avg_of(name)
        if n is not None and (best_n is None or n > best_n):
            best_name, best_n = name, n
    return best_name
71+
72+
73+
def extract_aime2025_score(path, repeats=None):
    """Extract the AIME 2025 accuracy and across-run stderr from a results file.

    The metric is chosen with ``select_metric`` under
    ``results.groups.aime25.metrics``. Accuracy is taken unchanged from
    ``scores.symbolic_correct.value``; the optional
    ``symbolic_correct_statistics_std_err_across_runs.value`` is multiplied
    by 100 so that it is reported in percentage points.

    Args:
        path: Path to the results YAML file.
        repeats: Configured repeat count, used to prefer the matching
            ``pass@1[avg-of-N]`` metric. ``None`` lets ``select_metric``
            pick one.

    Returns:
        A dict with the keys ``group``, ``metric`` (the selected metric
        name), ``score_key``, ``accuracy`` and ``stderr`` (``None`` when the
        across-run stderr entry is absent).

    Raises:
        KeyError: If a key on the expected results path is missing.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being leaked until garbage collection.
    with open(path, encoding="utf-8") as handle:
        data = yaml.safe_load(handle)

    metrics = data["results"]["groups"]["aime25"]["metrics"]
    metric_name = select_metric(metrics, repeats)
    scores = metrics[metric_name]["scores"]

    accuracy = scores["symbolic_correct"]["value"]

    # The across-run stderr only exists for repeated runs.
    stderr_value = scores.get(
        "symbolic_correct_statistics_std_err_across_runs", {}
    ).get("value")
    stderr = stderr_value * 100 if stderr_value is not None else None

    return {
        "group": "aime25",
        "metric": metric_name,
        "score_key": "symbolic_correct",
        "accuracy": accuracy,
        "stderr": stderr,
    }
92+
93+
```

.claude/skills/evaluation/recipes/tasks/gpqa.md

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ repeat count is unknown, use the highest available `avg-of-N`.
5151

5252
```python
5353
import re
54-
import sys
5554
import yaml
5655

5756

@@ -92,9 +91,4 @@ def extract_gpqa_score(path, repeats=None):
9291
"stderr": stderr,
9392
}
9493

95-
96-
if __name__ == "__main__":
97-
path = sys.argv[1]
98-
repeats = int(sys.argv[2]) if len(sys.argv) > 2 else None
99-
print(extract_gpqa_score(path, repeats))
10094
```

.claude/skills/evaluation/recipes/tasks/ifbench.md

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,59 @@ Use this inside the top-level `evaluation.tasks` list:
3333
3434
## Score Extraction
3535
36-
IFBench accuracy comes from:
36+
IFBench primary AA-aligned accuracy (in percentage points) comes from:
3737
3838
```text
3939
results.groups.ifbench.metrics."pass@1[avg-of-N]".scores.prompt_loose_accuracy.value
4040
```
41+
42+
`results.yml` does **not** include a direct
43+
`prompt_loose_accuracy_statistics_std_err_across_runs`; the closest available
44+
across-run stderr is `prompt_statistics_std_err_across_runs`. It is computed
45+
over the strict + loose prompt-level average rather than
46+
`prompt_loose_accuracy` alone, so report it as an approximate uncertainty.
47+
48+
```python
49+
import re
50+
import yaml
51+
52+
53+
def avg_of(metric_name):
    """Parse the repeat count out of a ``pass@1[avg-of-N]`` metric name.

    Returns the integer N when the entire name matches that pattern and
    ``None`` for every other name.
    """
    parsed = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)
    if parsed is None:
        return None
    return int(parsed.group(1))
56+
57+
58+
def select_metric(metrics, repeats=None):
    """Pick the ``pass@1`` metric name whose scores should be reported.

    An exact ``pass@1[avg-of-{repeats}]`` match wins when ``repeats`` is
    given and present in ``metrics``. Failing that, the ``avg-of-N`` name
    with the highest N is used. If there is none, ``"pass@1"`` is returned
    (without checking that it is a key of ``metrics``).
    """
    if repeats is not None:
        candidate = f"pass@1[avg-of-{repeats}]"
        if candidate in metrics:
            return candidate

    # Scan once, remembering the first name that carries the largest N.
    chosen, chosen_n = "pass@1", None
    for metric in metrics:
        count = avg_of(metric)
        if count is None:
            continue
        if chosen_n is None or count > chosen_n:
            chosen, chosen_n = metric, count
    return chosen
68+
69+
70+
def extract_ifbench_score(path, repeats=None):
    """Extract the IFBench accuracy and a proxy stderr from a results file.

    The metric is chosen with ``select_metric`` under
    ``results.groups.ifbench.metrics``. Accuracy is taken unchanged from
    ``scores.prompt_loose_accuracy.value``. The stderr comes from
    ``prompt_statistics_std_err_across_runs.value`` multiplied by 100; it is
    a proxy rather than a stderr of ``prompt_loose_accuracy`` itself, which
    is why the result labels its source.

    Args:
        path: Path to the results YAML file.
        repeats: Configured repeat count, used to prefer the matching
            ``pass@1[avg-of-N]`` metric. ``None`` lets ``select_metric``
            pick one.

    Returns:
        A dict with the keys ``group``, ``metric`` (the selected metric
        name), ``score_key``, ``accuracy``, ``stderr`` (``None`` when the
        proxy entry is absent) and ``stderr_source``.

    Raises:
        KeyError: If a key on the expected results path is missing.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being leaked until garbage collection.
    with open(path, encoding="utf-8") as handle:
        data = yaml.safe_load(handle)

    metrics = data["results"]["groups"]["ifbench"]["metrics"]
    metric_name = select_metric(metrics, repeats)
    scores = metrics[metric_name]["scores"]

    accuracy = scores["prompt_loose_accuracy"]["value"]

    # Optional: fall back to ``None`` when no across-run statistic exists.
    proxy_stderr_value = scores.get(
        "prompt_statistics_std_err_across_runs", {}
    ).get("value")
    stderr = proxy_stderr_value * 100 if proxy_stderr_value is not None else None

    return {
        "group": "ifbench",
        "metric": metric_name,
        "score_key": "prompt_loose_accuracy",
        "accuracy": accuracy,
        "stderr": stderr,
        "stderr_source": "prompt_statistics_std_err_across_runs (proxy)",
    }
90+
91+
```

.claude/skills/evaluation/recipes/tasks/mmlu_pro.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,32 @@ Use this inside the top-level `evaluation.tasks` list:
3333
```
3434
3535
## Score Extraction
36+
37+
```text
38+
results.groups.mmlu_pro.metrics.pass@1.scores.symbolic_correct.value
39+
```
40+
41+
`num_repeats: 1` is the standard setting, so `results.yml` does not include
42+
an across-run stderr. The score is computed over a single pass of the
43+
dataset (`stats.count` equals `num_problems`).
44+
45+
```python
46+
import yaml
47+
48+
49+
def extract_mmlu_pro_score(path):
    """Extract the MMLU-Pro accuracy and sample count from a results file.

    Reads ``results.groups.mmlu_pro.metrics.pass@1.scores.symbolic_correct``.
    No stderr is extracted, so ``stderr`` is always ``None``.

    Args:
        path: Path to the results YAML file.

    Returns:
        A dict with the keys ``group``, ``metric``, ``score_key``,
        ``accuracy`` (the entry's ``value``, unchanged), ``stderr``
        (always ``None``) and ``n`` (``stats.count``, or ``None`` when
        absent).

    Raises:
        KeyError: If a key on the expected results path is missing.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being leaked until garbage collection.
    with open(path, encoding="utf-8") as handle:
        data = yaml.safe_load(handle)

    scores = data["results"]["groups"]["mmlu_pro"]["metrics"]["pass@1"]["scores"]
    entry = scores["symbolic_correct"]
    accuracy = entry["value"]
    n = entry.get("stats", {}).get("count")

    return {
        "group": "mmlu_pro",
        "metric": "pass@1",
        "score_key": "symbolic_correct",
        "accuracy": accuracy,
        "stderr": None,
        "n": n,
    }
64+
```

.claude/skills/evaluation/recipes/tasks/mmmu_pro.md

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,34 @@ Use this inside the top-level `evaluation.tasks` list:
3030
3131
## Score Extraction
3232
33-
MMMU-Pro accuracy comes from:
33+
MMMU-Pro accuracy (already in percentage points) comes from:
3434
3535
```text
3636
results.groups."mmmu-pro".metrics.pass@1.scores.symbolic_correct.value
3737
```
38+
39+
`num_repeats: 1` is the standard setting, so `results.yml` does not include
40+
an across-run stderr. The score is computed over a single pass of the
41+
dataset (`stats.count` equals `num_problems`).
42+
43+
```python
44+
import yaml
45+
46+
47+
def extract_mmmu_pro_score(path):
    """Extract the MMMU-Pro accuracy and sample count from a results file.

    Reads ``results.groups."mmmu-pro".metrics.pass@1.scores.symbolic_correct``.
    No stderr is extracted, so ``stderr`` is always ``None``.

    Args:
        path: Path to the results YAML file.

    Returns:
        A dict with the keys ``group``, ``metric``, ``score_key``,
        ``accuracy`` (the entry's ``value``, unchanged), ``stderr``
        (always ``None``) and ``n`` (``stats.count``, or ``None`` when
        absent).

    Raises:
        KeyError: If a key on the expected results path is missing.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being leaked until garbage collection.
    with open(path, encoding="utf-8") as handle:
        data = yaml.safe_load(handle)

    scores = data["results"]["groups"]["mmmu-pro"]["metrics"]["pass@1"]["scores"]
    entry = scores["symbolic_correct"]
    accuracy = entry["value"]
    n = entry.get("stats", {}).get("count")

    return {
        "group": "mmmu-pro",
        "metric": "pass@1",
        "score_key": "symbolic_correct",
        "accuracy": accuracy,
        "stderr": None,
        "n": n,
    }
62+
63+
```

.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,26 @@ HLE AA accuracy comes from:
4242
```text
4343
results.groups.hle.metrics.pass@1.scores.judge_correct.value
4444
```
45+
46+
```python
47+
import yaml
48+
49+
50+
def extract_ns_hle_aa_score(path):
    """Extract the HLE judge accuracy and related fields from a results file.

    Reads ``results.groups.hle.metrics.pass@1.scores``. The primary score is
    ``judge_correct.value``; ``symbolic_correct.value`` is included when
    present. No stderr is extracted, so ``stderr`` is always ``None``.

    Args:
        path: Path to the results YAML file.

    Returns:
        A dict with the keys ``group``, ``metric``, ``score_key``,
        ``accuracy`` (``judge_correct.value``, unchanged),
        ``symbolic_correct`` (or ``None`` when absent), ``stderr``
        (always ``None``) and ``n`` (``judge_correct.stats.count``, or
        ``None`` when absent).

    Raises:
        KeyError: If a key on the expected results path is missing.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being leaked until garbage collection.
    with open(path, encoding="utf-8") as handle:
        data = yaml.safe_load(handle)

    scores = data["results"]["groups"]["hle"]["metrics"]["pass@1"]["scores"]
    judge_entry = scores["judge_correct"]
    accuracy = judge_entry["value"]
    symbolic = scores.get("symbolic_correct", {}).get("value")
    n = judge_entry.get("stats", {}).get("count")

    return {
        "group": "hle",
        "metric": "pass@1",
        "score_key": "judge_correct",
        "accuracy": accuracy,
        "symbolic_correct": symbolic,
        "stderr": None,
        "n": n,
    }
66+
67+
```

.claude/skills/evaluation/recipes/tasks/scicode.md

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ The helper below also supports GPQA's matching layout, where accuracy comes from
8686

8787
```python
8888
import re
89-
import sys
9089
import yaml
9190
9291
@@ -143,9 +142,4 @@ def extract_score(path, group="scicode"):
143142
"stderr": stderr,
144143
}
145144
146-
147-
if __name__ == "__main__":
148-
path = sys.argv[1]
149-
group = sys.argv[2] if len(sys.argv) > 2 else "scicode"
150-
print(extract_score(path, group))
151145
```

0 commit comments

Comments
 (0)