Skip to content

Commit 1ad9c96

Browse files
sjarmak and claude committed
feat: per-task variance gap analysis — 26 SDLC tasks need reruns
Add scripts/variance_gap_analysis.py to scan runs/official/ MANIFEST and staging dirs, counting paired passes per individual task (not per suite). 124/150 SDLC tasks already have 3+ paired runs; 26 need 1-2 more. Generated targeted rerun configs in configs/variance_reruns/: - variance_gap_daytona.json (22 tasks, Daytona-compatible) - variance_gap_local.json (4 sweap-images tasks, local Docker) - wave1_daytona.json (6 tasks needing 2 additional passes) - wave2_daytona.json (16 tasks needing 1 additional pass) - Per-suite breakdowns: debug=2, feature=4, fix=5, test=15 - 5 suites fully covered: design, document, refactor, secure, understand Total: 72 sandbox runs to bring all 150 SDLC tasks to 3 paired passes. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4126431 commit 1ad9c96

14 files changed

+2609
-1
lines changed

configs/variance_reruns/variance_gap_all_sdlc.json

Lines changed: 415 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
{
2+
"metadata": {
3+
"title": "Variance rerun: ccb_debug gap tasks (2 tasks, target 3 pairs)",
4+
"description": "Targeted rerun for ccb_debug tasks with < 3 paired passes. Generated by variance_gap_analysis.py.",
5+
"generated_date": "2026-03-01",
6+
"total_tasks": 2,
7+
"max_concurrency_needed": 2,
8+
"note": "Run with --concurrency 2 to fill all gaps in one batch. Or run with --concurrency 1 multiple times."
9+
},
10+
"methodology": {
11+
"sdlc_suites": [
12+
"ccb_debug"
13+
]
14+
},
15+
"statistics": {
16+
"total_tasks": 2,
17+
"per_suite": {
18+
"ccb_debug": 2
19+
}
20+
},
21+
"tasks": [
22+
{
23+
"task_id": "teleport-ssh-regression-prove-001",
24+
"benchmark": "ccb_debug",
25+
"task_dir": "ccb_debug/teleport-ssh-regression-prove-001",
26+
"language": "go",
27+
"difficulty": "hard",
28+
"current_bl_runs": 1,
29+
"current_mcp_runs": 3,
30+
"current_paired": 1,
31+
"runs_needed": 2,
32+
"sdlc_phase": "debug",
33+
"repo": "gravitational/teleport",
34+
"mcp_benefit_score": 0.75
35+
},
36+
{
37+
"task_id": "tutanota-search-regression-prove-001",
38+
"benchmark": "ccb_debug",
39+
"task_dir": "ccb_debug/tutanota-search-regression-prove-001",
40+
"language": "typescript",
41+
"difficulty": "hard",
42+
"current_bl_runs": 3,
43+
"current_mcp_runs": 1,
44+
"current_paired": 1,
45+
"runs_needed": 2,
46+
"sdlc_phase": "debug",
47+
"repo": "tutanota/tutanota",
48+
"mcp_benefit_score": 0.75
49+
}
50+
]
51+
}
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
{
2+
"metadata": {
3+
"title": "Variance rerun: ccb_feature gap tasks (4 tasks, target 3 pairs)",
4+
"description": "Targeted rerun for ccb_feature tasks with < 3 paired passes. Generated by variance_gap_analysis.py.",
5+
"generated_date": "2026-03-01",
6+
"total_tasks": 4,
7+
"max_concurrency_needed": 2,
8+
"note": "Run with --concurrency 2 to fill all gaps in one batch. Or run with --concurrency 1 multiple times."
9+
},
10+
"methodology": {
11+
"sdlc_suites": [
12+
"ccb_feature"
13+
]
14+
},
15+
"statistics": {
16+
"total_tasks": 4,
17+
"per_suite": {
18+
"ccb_feature": 4
19+
}
20+
},
21+
"tasks": [
22+
{
23+
"task_id": "k8s-runtime-object-impl-001",
24+
"benchmark": "ccb_feature",
25+
"task_dir": "ccb_feature/k8s-runtime-object-impl-001",
26+
"language": "go",
27+
"difficulty": "hard",
28+
"current_bl_runs": 3,
29+
"current_mcp_runs": 2,
30+
"current_paired": 2,
31+
"runs_needed": 1,
32+
"sdlc_phase": "Requirements & Discovery",
33+
"category": "interface_implementation",
34+
"repo": "kubernetes/kubernetes",
35+
"mcp_benefit_score": 0.88
36+
},
37+
{
38+
"task_id": "postgres-copy-csv-header-feat-001",
39+
"benchmark": "ccb_feature",
40+
"task_dir": "ccb_feature/postgres-copy-csv-header-feat-001",
41+
"language": "c",
42+
"difficulty": "expert",
43+
"current_bl_runs": 1,
44+
"current_mcp_runs": 1,
45+
"current_paired": 1,
46+
"runs_needed": 2,
47+
"sdlc_phase": "Implementation (feature)",
48+
"category": "feature_implementation",
49+
"repo": "postgres/postgres",
50+
"mcp_benefit_score": 0.88
51+
},
52+
{
53+
"task_id": "servo-css-container-query-feat-001",
54+
"benchmark": "ccb_feature",
55+
"task_dir": "ccb_feature/servo-css-container-query-feat-001",
56+
"language": "rust",
57+
"difficulty": "expert",
58+
"current_bl_runs": 1,
59+
"current_mcp_runs": 1,
60+
"current_paired": 1,
61+
"runs_needed": 2,
62+
"sdlc_phase": "Implementation (feature)",
63+
"category": "feature_implementation",
64+
"repo": "servo/servo",
65+
"mcp_benefit_score": 0.89
66+
},
67+
{
68+
"task_id": "vscode-custom-fold-region-feat-001",
69+
"benchmark": "ccb_feature",
70+
"task_dir": "ccb_feature/vscode-custom-fold-region-feat-001",
71+
"language": "typescript",
72+
"difficulty": "hard",
73+
"current_bl_runs": 1,
74+
"current_mcp_runs": 1,
75+
"current_paired": 1,
76+
"runs_needed": 2,
77+
"sdlc_phase": "Implementation (feature)",
78+
"category": "feature_implementation",
79+
"repo": "microsoft/vscode",
80+
"mcp_benefit_score": 0.87
81+
}
82+
]
83+
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
{
2+
"metadata": {
3+
"title": "Variance rerun: ccb_fix gap tasks (5 tasks, target 3 pairs)",
4+
"description": "Targeted rerun for ccb_fix tasks with < 3 paired passes. Generated by variance_gap_analysis.py.",
5+
"generated_date": "2026-03-01",
6+
"total_tasks": 5,
7+
"max_concurrency_needed": 1,
8+
"note": "Run with --concurrency 1 to fill all gaps in one batch. Or run with --concurrency 1 multiple times."
9+
},
10+
"methodology": {
11+
"sdlc_suites": [
12+
"ccb_fix"
13+
]
14+
},
15+
"statistics": {
16+
"total_tasks": 5,
17+
"per_suite": {
18+
"ccb_fix": 5
19+
}
20+
},
21+
"tasks": [
22+
{
23+
"task_id": "flink-window-late-data-fix-001",
24+
"benchmark": "ccb_fix",
25+
"task_dir": "ccb_fix/flink-window-late-data-fix-001",
26+
"language": "java",
27+
"difficulty": "hard",
28+
"current_bl_runs": 2,
29+
"current_mcp_runs": 2,
30+
"current_paired": 2,
31+
"runs_needed": 1,
32+
"sdlc_phase": "Implementation (bug fix)",
33+
"category": "bug_fix",
34+
"repo": "apache/flink",
35+
"mcp_benefit_score": 0.87
36+
},
37+
{
38+
"task_id": "nodebb-plugin-validate-fix-001",
39+
"benchmark": "ccb_fix",
40+
"task_dir": "ccb_fix/nodebb-plugin-validate-fix-001",
41+
"language": "javascript",
42+
"difficulty": "medium",
43+
"current_bl_runs": 2,
44+
"current_mcp_runs": 4,
45+
"current_paired": 2,
46+
"runs_needed": 1,
47+
"sdlc_phase": "fix",
48+
"repo": "NodeBB/NodeBB",
49+
"mcp_benefit_score": 0.75
50+
},
51+
{
52+
"task_id": "openlibrary-fntocli-adapter-fix-001",
53+
"benchmark": "ccb_fix",
54+
"task_dir": "ccb_fix/openlibrary-fntocli-adapter-fix-001",
55+
"language": "python",
56+
"difficulty": "hard",
57+
"current_bl_runs": 2,
58+
"current_mcp_runs": 2,
59+
"current_paired": 2,
60+
"runs_needed": 1,
61+
"sdlc_phase": "Implementation (bug fix)",
62+
"category": "fix",
63+
"repo": "internetarchive/openlibrary",
64+
"mcp_benefit_score": 0.85
65+
},
66+
{
67+
"task_id": "pytorch-release-210-fix-001",
68+
"benchmark": "ccb_fix",
69+
"task_dir": "ccb_fix/pytorch-release-210-fix-001",
70+
"language": "cpp",
71+
"difficulty": "hard",
72+
"current_bl_runs": 2,
73+
"current_mcp_runs": 2,
74+
"current_paired": 2,
75+
"runs_needed": 1,
76+
"sdlc_phase": "Implementation (bug fix)",
77+
"category": "fix",
78+
"repo": "pytorch",
79+
"mcp_benefit_score": 0.85
80+
},
81+
{
82+
"task_id": "pytorch-relu-gelu-fusion-fix-001",
83+
"benchmark": "ccb_fix",
84+
"task_dir": "ccb_fix/pytorch-relu-gelu-fusion-fix-001",
85+
"language": "cpp",
86+
"difficulty": "hard",
87+
"current_bl_runs": 2,
88+
"current_mcp_runs": 2,
89+
"current_paired": 2,
90+
"runs_needed": 1,
91+
"sdlc_phase": "Implementation (bug fix)",
92+
"category": "fix",
93+
"repo": "pytorch/pytorch",
94+
"mcp_benefit_score": 0.85
95+
}
96+
]
97+
}

0 commit comments

Comments
 (0)