Add evaluation score extraction helpers

chadvoegele · chadvoegele · commit 343fe71925da · 2026-05-20T09:44:26.000-05:00
Signed-off-by: Chad Voegele &lt;cvoegele@nvidia.com&gt;
diff --git a/.claude/skills/evaluation/recipes/tasks/aa_lcr.md b/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
@@ -4,7 +4,7 @@
 
 - Task: `aa_lcr`
 - Harness: AA-LCR, chat
-- Primary metric: `pass@1 judge_correct`
+- Primary metric: `accuracy.accuracy`
 - Run time: Long
 - Samples: 3
 - Requires: `HF_TOKEN`, `JUDGE_API_KEY`
@@ -45,5 +45,28 @@ Use this inside the top-level `evaluation.tasks` list:
 AA-LCR accuracy comes from:
 
 ```text
-results.groups.aa_lcr.metrics.pass@1.scores.judge_correct.value
+results.groups.aa_lcr.metrics.accuracy.scores.accuracy.value
+results.groups.aa_lcr.metrics.accuracy.scores.accuracy.stats.stderr
+```
+
+```python
+import yaml
+
+
+def extract_aa_lcr_score(path):
+    """Return AA-LCR accuracy and stderr from a results.yml file, both scaled by 100."""
+    with open(path, encoding="utf-8") as f:  # close the handle deterministically
+        data = yaml.safe_load(f)
+    scores = data["results"]["groups"]["aa_lcr"]["metrics"]["accuracy"]["scores"]
+    entry = scores["accuracy"]
+    stderr = entry.get("stats", {}).get("stderr")  # None when no stderr is recorded
+
+    return {
+        "group": "aa_lcr",
+        "metric": "accuracy",
+        "score_key": "accuracy",
+        "accuracy": entry["value"] * 100,
+        "stderr": stderr * 100 if stderr is not None else None,
+    }
+
 ```
diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.md b/.claude/skills/evaluation/recipes/tasks/aime2025.md
@@ -33,5 +33,61 @@ Use this inside the top-level `evaluation.tasks` list:
 
 ## Score Extraction
 
-Prefer the `pass@1[avg-of-N]` metric matching the configured sample/repeat
-count.
+AIME accuracy (in percentage points) comes from:
+
+```text
+results.groups.aime25.metrics."pass@1[avg-of-N]".scores.symbolic_correct.value
+```
+
+For repeated runs, report stderr as percentage points:
+
+```text
+results.groups.aime25.metrics."pass@1[avg-of-N]".scores.symbolic_correct_statistics_std_err_across_runs.value * 100
+```
+
+Prefer the `pass@1[avg-of-N]` metric matching the configured repeat count.
+If the repeat count is unknown, use the highest available `avg-of-N`.
+
+```python
+import re
+import yaml
+
+
+def avg_of(metric_name):
+    """Return N as an int when the name is exactly `pass@1[avg-of-N]`, else None."""
+    hit = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)
+    if hit is None:
+        return None
+    return int(hit.group(1))
+
+
+def select_metric(metrics, repeats=None):
+    """Pick the exact avg-of-`repeats` key if present, else the largest avg-of-N, else `pass@1`."""
+    wanted = f"pass@1[avg-of-{repeats}]"
+    if repeats is not None and wanted in metrics:
+        return wanted
+    candidates = [name for name in metrics if avg_of(name) is not None]
+    return max(candidates, key=avg_of) if candidates else "pass@1"
+
+
+def extract_aime2025_score(path, repeats=None):
+    """Extract the AIME 2025 accuracy and across-run stderr from a results.yml file.
+
+    `repeats` is forwarded to select_metric to choose the metric key; stderr is scaled by 100.
+    """
+    with open(path, encoding="utf-8") as f:  # close the handle deterministically
+        data = yaml.safe_load(f)
+    metrics = data["results"]["groups"]["aime25"]["metrics"]
+    metric_name = select_metric(metrics, repeats)
+    scores = metrics[metric_name]["scores"]
+    # The across-run stderr entry is optional; a missing entry yields None.
+    stderr_value = scores.get("symbolic_correct_statistics_std_err_across_runs", {}).get("value")
+    return {
+        "group": "aime25",
+        "metric": metric_name,
+        "score_key": "symbolic_correct",
+        "accuracy": scores["symbolic_correct"]["value"],
+        "stderr": stderr_value * 100 if stderr_value is not None else None,
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.md b/.claude/skills/evaluation/recipes/tasks/gpqa.md
@@ -51,7 +51,6 @@ repeat count is unknown, use the highest available `avg-of-N`.
 
 ```python
 import re
-import sys
 import yaml
 
 
@@ -92,9 +91,4 @@ def extract_gpqa_score(path, repeats=None):
         "stderr": stderr,
     }
 
-
-if __name__ == "__main__":
-    path = sys.argv[1]
-    repeats = int(sys.argv[2]) if len(sys.argv) > 2 else None
-    print(extract_gpqa_score(path, repeats))
 ```
diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.md b/.claude/skills/evaluation/recipes/tasks/ifbench.md
@@ -33,8 +33,59 @@ Use this inside the top-level `evaluation.tasks` list:
 
 ## Score Extraction
 
-IFBench accuracy comes from:
+IFBench primary AA-aligned accuracy (in percentage points) comes from:
 
 ```text
 results.groups.ifbench.metrics."pass@1[avg-of-N]".scores.prompt_loose_accuracy.value
 ```
+
+`results.yml` does **not** include a dedicated
+`prompt_loose_accuracy_statistics_std_err_across_runs` entry; the closest
+available across-run stderr is `prompt_statistics_std_err_across_runs`. It is
+computed over the strict + loose prompt-level average rather than
+`prompt_loose_accuracy` alone, so report it only as an approximate uncertainty.
+
+```python
+import re
+import yaml
+
+
+def avg_of(metric_name):
+    """Return N as an int when the name is exactly `pass@1[avg-of-N]`, else None."""
+    hit = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)
+    if hit is None:
+        return None
+    return int(hit.group(1))
+
+
+def select_metric(metrics, repeats=None):
+    """Pick the exact avg-of-`repeats` key if present, else the largest avg-of-N, else `pass@1`."""
+    wanted = f"pass@1[avg-of-{repeats}]"
+    if repeats is not None and wanted in metrics:
+        return wanted
+    candidates = [name for name in metrics if avg_of(name) is not None]
+    return max(candidates, key=avg_of) if candidates else "pass@1"
+
+
+def extract_ifbench_score(path, repeats=None):
+    """Extract IFBench loose prompt accuracy and a proxy across-run stderr from results.yml.
+
+    `repeats` is forwarded to select_metric to choose the metric key; the proxy is scaled by 100.
+    """
+    with open(path, encoding="utf-8") as f:  # close the handle deterministically
+        data = yaml.safe_load(f)
+    metrics = data["results"]["groups"]["ifbench"]["metrics"]
+    metric_name = select_metric(metrics, repeats)
+    scores = metrics[metric_name]["scores"]
+    # Prompt-level stderr stands in for a loose-only one; a missing entry yields None.
+    proxy_stderr_value = scores.get("prompt_statistics_std_err_across_runs", {}).get("value")
+    return {
+        "group": "ifbench",
+        "metric": metric_name,
+        "score_key": "prompt_loose_accuracy",
+        "accuracy": scores["prompt_loose_accuracy"]["value"],
+        "stderr": proxy_stderr_value * 100 if proxy_stderr_value is not None else None,
+        "stderr_source": "prompt_statistics_std_err_across_runs (proxy)",
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
@@ -33,3 +33,32 @@ Use this inside the top-level `evaluation.tasks` list:
 ```
 
 ## Score Extraction
+
+```text
+results.groups.mmlu_pro.metrics.pass@1.scores.symbolic_correct.value
+```
+
+`num_repeats: 1` is the standard setting, so `results.yml` does not include
+an across-run stderr. The score is computed over a single pass of the
+dataset (`stats.count` equals `num_problems`).
+
+```python
+import yaml
+
+
+def extract_mmlu_pro_score(path):
+    """Return the MMLU-Pro pass@1 symbolic_correct value and its stats count from results.yml."""
+    with open(path, encoding="utf-8") as f:  # close the handle deterministically
+        data = yaml.safe_load(f)
+    scores = data["results"]["groups"]["mmlu_pro"]["metrics"]["pass@1"]["scores"]
+    entry = scores["symbolic_correct"]
+
+    return {
+        "group": "mmlu_pro",
+        "metric": "pass@1",
+        "score_key": "symbolic_correct",
+        "accuracy": entry["value"],
+        "stderr": None,  # this helper never reads a stderr entry
+        "n": entry.get("stats", {}).get("count"),  # None when stats/count is absent
+    }
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md b/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md
@@ -30,8 +30,34 @@ Use this inside the top-level `evaluation.tasks` list:
 
 ## Score Extraction
 
-MMMU-Pro accuracy comes from:
+MMMU-Pro accuracy (already in percentage points) comes from:
 
 ```text
 results.groups."mmmu-pro".metrics.pass@1.scores.symbolic_correct.value
 ```
+
+`num_repeats: 1` is the standard setting, so `results.yml` does not include
+an across-run stderr. The score is computed over a single pass of the
+dataset (`stats.count` equals `num_problems`).
+
+```python
+import yaml
+
+
+def extract_mmmu_pro_score(path):
+    """Return the MMMU-Pro pass@1 symbolic_correct value and its stats count from results.yml."""
+    with open(path, encoding="utf-8") as f:  # close the handle deterministically
+        data = yaml.safe_load(f)
+    scores = data["results"]["groups"]["mmmu-pro"]["metrics"]["pass@1"]["scores"]
+    entry = scores["symbolic_correct"]
+
+    return {
+        "group": "mmmu-pro",
+        "metric": "pass@1",
+        "score_key": "symbolic_correct",
+        "accuracy": entry["value"],
+        "stderr": None,  # this helper never reads a stderr entry
+        "n": entry.get("stats", {}).get("count"),  # None when stats/count is absent
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md b/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md
@@ -42,3 +42,26 @@ HLE AA accuracy comes from:
 ```text
 results.groups.hle.metrics.pass@1.scores.judge_correct.value
 ```
+
+```python
+import yaml
+
+
+def extract_ns_hle_aa_score(path):
+    """Return the HLE pass@1 judge_correct value, plus symbolic_correct and count if present."""
+    with open(path, encoding="utf-8") as f:  # close the handle deterministically
+        data = yaml.safe_load(f)
+    scores = data["results"]["groups"]["hle"]["metrics"]["pass@1"]["scores"]
+    judged = scores["judge_correct"]
+
+    return {
+        "group": "hle",
+        "metric": "pass@1",
+        "score_key": "judge_correct",
+        "accuracy": judged["value"],
+        "symbolic_correct": scores.get("symbolic_correct", {}).get("value"),
+        "stderr": None,
+        "n": judged.get("stats", {}).get("count"),
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/scicode.md b/.claude/skills/evaluation/recipes/tasks/scicode.md
@@ -86,7 +86,6 @@ The helper below also supports GPQA's matching layout, where accuracy comes from
 
 ```python
 import re
-import sys
 import yaml
 
 
@@ -143,9 +142,4 @@ def extract_score(path, group="scicode"):
         "stderr": stderr,
     }
 
-
-if __name__ == "__main__":
-    path = sys.argv[1]
-    group = sys.argv[2] if len(sys.argv) > 2 else "scicode"
-    print(extract_score(path, group))
 ```