-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathreasoning_sweep_report.json
More file actions
114 lines (114 loc) · 2.31 KB
/
reasoning_sweep_report.json
File metadata and controls
114 lines (114 loc) · 2.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
{
"accuracy": 0.9069387755102041,
"ece": 0.2599607898363426,
"critical_gaps": [],
"recommendations": [
"INVESTIGATE: R34 deactivated in 8/25 cases",
"PRIORITY: Improve invariant detection (R14) and stress testing (R26)",
"PRIORITY: Reduce deactivation rate of R23 (contradiction) and R13 (reduction)"
],
"by_domain": {
"number_theory": {
"total": 200,
"correct": 200,
"activated": 180
},
"geometry": {
"total": 125,
"correct": 100,
"activated": 114
},
"combinatorics": {
"total": 200,
"correct": 184,
"activated": 172
},
"algebra": {
"total": 125,
"correct": 115,
"activated": 118
},
"inequality": {
"total": 100,
"correct": 84,
"activated": 95
},
"functional_equation": {
"total": 150,
"correct": 126,
"activated": 128
},
"game_theory": {
"total": 125,
"correct": 110,
"activated": 110
},
"combinatorial_geometry": {
"total": 200,
"correct": 192,
"activated": 186
}
},
"by_reasoning": {
"R08": {
"success_rate": 0.92,
"activation_rate": 0.93
},
"R10": {
"success_rate": 0.895,
"activation_rate": 0.985
},
"R12": {
"success_rate": 0.96,
"activation_rate": 0.86
},
"R14": {
"success_rate": 0.8971428571428571,
"activation_rate": 0.9885714285714285
},
"R15": {
"success_rate": 0.96,
"activation_rate": 0.8933333333333333
},
"R19": {
"success_rate": 0.94,
"activation_rate": 0.88
},
"R22": {
"success_rate": 0.92,
"activation_rate": 0.8133333333333334
},
"R23": {
"success_rate": 0.92,
"activation_rate": 0.8
},
"R04": {
"success_rate": 0.88,
"activation_rate": 0.8
},
"R17": {
"success_rate": 0.88,
"activation_rate": 0.8914285714285715
},
"R26": {
"success_rate": 0.88,
"activation_rate": 0.84
},
"R29": {
"success_rate": 0.92,
"activation_rate": 0.84
},
"R34": {
"success_rate": 0.84,
"activation_rate": 0.68
},
"R48": {
"success_rate": 0.88,
"activation_rate": 0.84
},
"R13": {
"success_rate": 0.96,
"activation_rate": 0.92
}
}
}