@@ -33,8 +33,59 @@ Use this inside the top-level `evaluation.tasks` list:
3333
## Score Extraction
3535
36- IFBench accuracy comes from:
36+ IFBench primary AA-aligned accuracy (in percentage points) comes from:
3737
```text
results.groups.ifbench.metrics."pass@1[avg-of-N]".scores.prompt_loose_accuracy.value
```
41+
`results.yml` does **not** include a direct
`prompt_loose_accuracy_statistics_std_err_across_runs`; the closest available
across-run stderr is `prompt_statistics_std_err_across_runs`. It is computed
over the strict + loose prompt-level average rather than
`prompt_loose_accuracy` alone, so report it as an approximate uncertainty.
47+
48+ ``` python
49+ import re
50+ import yaml
51+
52+
def avg_of(metric_name):
    """Return the repeat count N encoded in a ``pass@1[avg-of-N]`` metric name.

    Args:
        metric_name: Metric key to inspect, e.g. ``"pass@1[avg-of-4]"``.

    Returns:
        N as an ``int`` when the whole name has the form
        ``pass@1[avg-of-N]``; ``None`` for any other name (including the
        plain ``"pass@1"``).
    """
    # fullmatch (not search/match) so names that merely contain the pattern,
    # such as a prefixed or suffixed variant, are rejected.
    match = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)
    return int(match.group(1)) if match else None
56+
57+
def select_metric(metrics, repeats=None):
    """Choose which ``pass@1`` metric key to read scores from.

    Selection order:
      1. If ``repeats`` is given and ``pass@1[avg-of-{repeats}]`` is present
         in ``metrics``, use it.
      2. Otherwise, among all keys of the form ``pass@1[avg-of-N]``, use the
         one with the largest N.
      3. Otherwise fall back to the plain ``"pass@1"`` key.

    Args:
        metrics: Mapping whose keys are metric names.
        repeats: Optional preferred repeat count N.

    Returns:
        The selected metric name. The ``"pass@1"`` fallback is returned
        without checking that it exists in ``metrics``.
    """
    if repeats is not None:
        expected = f"pass@1[avg-of-{repeats}]"
        if expected in metrics:
            return expected

    # The requested repeat count is absent (or none was requested): prefer
    # the averaged metric with the highest repeat count.
    repeated = [name for name in metrics if avg_of(name) is not None]
    if repeated:
        return max(repeated, key=avg_of)
    return "pass@1"
68+
69+
def extract_ifbench_score(path, repeats=None):
    """Read the IFBench accuracy and a proxy stderr from a results YAML file.

    Args:
        path: Path to the results YAML file.
        repeats: Optional preferred repeat count, forwarded to
            ``select_metric`` to pick the ``pass@1[avg-of-N]`` entry.

    Returns:
        A dict with keys ``group``, ``metric``, ``score_key``, ``accuracy``,
        ``stderr`` and ``stderr_source``. ``stderr`` is ``None`` when the
        file has no ``prompt_statistics_std_err_across_runs`` value.

    Raises:
        KeyError: If the expected ``results.groups.ifbench.metrics`` path,
            the selected metric, or ``prompt_loose_accuracy`` is missing.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(path, encoding="utf-8") as handle:
        data = yaml.safe_load(handle)

    metrics = data["results"]["groups"]["ifbench"]["metrics"]
    metric_name = select_metric(metrics, repeats)
    scores = metrics[metric_name]["scores"]

    accuracy = scores["prompt_loose_accuracy"]["value"]
    # No stderr exists for prompt_loose_accuracy alone; this across-run value
    # is used as a proxy and may be absent, hence the tolerant lookups.
    proxy_stderr_value = scores.get(
        "prompt_statistics_std_err_across_runs", {}
    ).get("value")
    # Scaled by 100 — presumably to put the stderr on the same
    # percentage-point scale as the accuracy; confirm against results.yml.
    stderr = proxy_stderr_value * 100 if proxy_stderr_value is not None else None

    return {
        "group": "ifbench",
        "metric": metric_name,
        "score_key": "prompt_loose_accuracy",
        "accuracy": accuracy,
        "stderr": stderr,
        "stderr_source": "prompt_statistics_std_err_across_runs (proxy)",
    }
90+
91+ ```
0 commit comments