-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathcross_validation_report.json
More file actions
89 lines (89 loc) · 1.88 KB
/
cross_validation_report.json
File metadata and controls
89 lines (89 loc) · 1.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
{
"sample_size": 12,
"correctness": {
"old": "3/12 (25%)",
"new": "12/12 (100%)",
"improvement": "+9 problems"
},
"score_improvement": {
"mean": 27.3,
"median": 24,
"min": 19,
"max": 51,
"old_avg": 53.4,
"new_avg": 80.8
},
"statistical_tests": {
"wilcoxon": {
"statistic": 78.0,
"p_value": "2.44e-04",
"significant_at_001": "True",
"interpretation": "Highly significant (p < 0.001) \u2014 strong evidence of improvement"
},
"cohens_d": {
"value": 3.05,
"interpretation": "Very large effect \u2014 dramatic improvement",
"magnitude": "very large"
},
"mcnemar": {
"statistic": 7.1111,
"p_value": "0.0077"
}
},
"confidence_calibration": {
"ece_old": 0.3692,
"ece_new": 0.1925,
"ece_improvement": 0.1767,
"interpretation": "Lower ECE = better calibration"
},
"domain_breakdown": {
"combinatorial_geometry": {
"avg_old": 34.0,
"avg_new": 85.0,
"improvement": 51.0,
"problems": 1
},
"number_theory": {
"avg_old": 60.4,
"avg_new": 86.8,
"improvement": 26.4,
"problems": 5
},
"functional_equation": {
"avg_old": 57.5,
"avg_new": 79.0,
"improvement": 21.5,
"problems": 2
},
"geometry": {
"avg_old": 50.0,
"avg_new": 75.0,
"improvement": 25.0,
"problems": 1
},
"combinatorics": {
"avg_old": 48.0,
"avg_new": 72.0,
"improvement": 24.0,
"problems": 1
},
"game_theory": {
"avg_old": 40.0,
"avg_new": 70.0,
"improvement": 30.0,
"problems": 1
},
"inequality": {
"avg_old": 52.0,
"avg_new": 75.0,
"improvement": 23.0,
"problems": 1
}
},
"failure_analysis": {
"old_failures": 9,
"new_failures": 0,
"failures_resolved": 9,
"persistent_failures": []
}
}