Skip to content

Commit f53d578

Browse files
committed
Harden scaffold validation and rerun sentinels
1 parent e5a6650 commit f53d578

16 files changed

+352
-41
lines changed

.github/workflows/task_smoke_matrix.yml

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ on:
88
- "benchmarks/**"
99
- "configs/validate_one_per_benchmark.sh"
1010
- "configs/registry_smoke_matrix.json"
11+
- "configs/registry_smoke_artifact_matrix.json"
1112
- "scripts/validate_tasks_preflight.py"
1213
- "docs/reference/TASK_CONTRACT.md"
1314
- ".github/workflows/task_smoke_matrix.yml"
@@ -17,6 +18,7 @@ on:
1718
- "benchmarks/**"
1819
- "configs/validate_one_per_benchmark.sh"
1920
- "configs/registry_smoke_matrix.json"
21+
- "configs/registry_smoke_artifact_matrix.json"
2022
- "scripts/validate_tasks_preflight.py"
2123
- "docs/reference/TASK_CONTRACT.md"
2224
- ".github/workflows/task_smoke_matrix.yml"
@@ -54,10 +56,13 @@ jobs:
5456
strategy:
5557
fail-fast: false
5658
matrix:
57-
variant:
58-
- baseline
59-
- sg-only
60-
- artifact-only
59+
include:
60+
- variant: baseline
61+
selection_file: configs/registry_smoke_matrix.json
62+
- variant: sg-only
63+
selection_file: configs/registry_smoke_matrix.json
64+
- variant: artifact-only
65+
selection_file: configs/registry_smoke_artifact_matrix.json
6166
steps:
6267
- uses: actions/checkout@v4
6368

@@ -76,7 +81,8 @@ jobs:
7681
artifact-only) extra_args+=(--artifact-only) ;;
7782
esac
7883
bash configs/validate_one_per_benchmark.sh \
79-
--selection-file configs/registry_smoke_matrix.json \
84+
--selection-file "${{ matrix.selection_file }}" \
85+
--exact-selection \
8086
--smoke-runtime \
8187
--smoke-timeout-sec 300 \
8288
--smoke-timeout-overrides "csb_sdlc_design=450,csb_sdlc_document=450,csb_sdlc_feature=600,csb_sdlc_fix=600,csb_sdlc_refactor=600,csb_sdlc_test=450,csb_sdlc_understand=450" \

benchmarks/csb_sdlc_understand/clickhouse-mergetree-arch-understand-001/task.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ description = "Checks task completion: file existence, content validation, patte
2525
build_timeout_sec = 1200.0
2626
cpus = 4
2727
memory = "8G"
28-
storage = "20G"
28+
storage = "10G"
2929

3030
[environment.setup_scripts]
3131
mcp_config = '''
Lines changed: 75 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,75 @@
1+
{
2+
"metadata": {
3+
"name": "claude_historical_failure_rerun_mcp_20260309",
4+
"description": "Focused Claude MCP rerun manifest for historical task-contract and harness sentinels.",
5+
"source_bead": "CodeScaleBench-2kz",
6+
"generated_date": "2026-03-09",
7+
"run_mode": "mcp-only"
8+
},
9+
"tasks": [
10+
{
11+
"task_id": "ccx-onboard-search-207",
12+
"benchmark": "csb_org_onboarding",
13+
"task_dir": "csb_org_onboarding/ccx-onboard-search-207",
14+
"language": "cpp",
15+
"difficulty": "hard",
16+
"repo": "mozilla/gecko-dev"
17+
},
18+
{
19+
"task_id": "ccx-onboard-search-208",
20+
"benchmark": "csb_org_onboarding",
21+
"task_dir": "csb_org_onboarding/ccx-onboard-search-208",
22+
"language": "cpp",
23+
"difficulty": "hard",
24+
"repo": "mozilla/gecko-dev"
25+
},
26+
{
27+
"task_id": "ccx-onboard-search-210",
28+
"benchmark": "csb_org_onboarding",
29+
"task_dir": "csb_org_onboarding/ccx-onboard-search-210",
30+
"language": "cpp",
31+
"difficulty": "hard",
32+
"repo": "envoyproxy/envoy"
33+
},
34+
{
35+
"task_id": "bustub-hyperloglog-impl-001",
36+
"benchmark": "csb_sdlc_feature",
37+
"task_dir": "csb_sdlc_feature/bustub-hyperloglog-impl-001",
38+
"language": "cpp",
39+
"difficulty": "hard",
40+
"repo": "cmu-db/bustub"
41+
},
42+
{
43+
"task_id": "django-sensitive-file-exclusion-001",
44+
"benchmark": "csb_sdlc_secure",
45+
"task_dir": "csb_sdlc_secure/django-sensitive-file-exclusion-001",
46+
"language": "python",
47+
"difficulty": "hard",
48+
"repo": "django/django"
49+
},
50+
{
51+
"task_id": "flink-window-late-data-fix-001",
52+
"benchmark": "csb_sdlc_fix",
53+
"task_dir": "csb_sdlc_fix/flink-window-late-data-fix-001",
54+
"language": "java",
55+
"difficulty": "hard",
56+
"repo": "apache/flink"
57+
},
58+
{
59+
"task_id": "element-web-unread-indicators-diverge-fix-001",
60+
"benchmark": "csb_sdlc_fix",
61+
"task_dir": "csb_sdlc_fix/element-web-unread-indicators-diverge-fix-001",
62+
"language": "typescript",
63+
"difficulty": "hard",
64+
"repo": "element-hq/element-web"
65+
},
66+
{
67+
"task_id": "clickhouse-mergetree-arch-understand-001",
68+
"benchmark": "csb_sdlc_understand",
69+
"task_dir": "csb_sdlc_understand/clickhouse-mergetree-arch-understand-001",
70+
"language": "cpp",
71+
"difficulty": "hard",
72+
"repo": "ClickHouse/ClickHouse"
73+
}
74+
]
75+
}
Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,19 @@
1+
{
2+
"metadata": {
3+
"name": "openhands_historical_failure_rerun_baseline_20260309",
4+
"description": "Focused OpenHands baseline rerun manifest for the remaining historical harness sentinel.",
5+
"source_bead": "CodeScaleBench-2kz",
6+
"generated_date": "2026-03-09",
7+
"run_mode": "baseline-only"
8+
},
9+
"tasks": [
10+
{
11+
"task_id": "ccx-onboard-search-212",
12+
"benchmark": "csb_org_onboarding",
13+
"task_dir": "csb_org_onboarding/ccx-onboard-search-212",
14+
"language": "python",
15+
"difficulty": "hard",
16+
"repo": "pandas-dev/pandas"
17+
}
18+
]
19+
}
Lines changed: 48 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,48 @@
1+
{
2+
"metadata": {
3+
"description": "Curated artifact_only smoke matrix for registry-ready task validation on build-requiring and historically fragile artifact variants.",
4+
"purpose": "Catch artifact transport and verifier-environment regressions without running a full benchmark batch."
5+
},
6+
"tasks": [
7+
{
8+
"task_id": "ccx-compliance-124",
9+
"benchmark": "csb_org_compliance",
10+
"task_dir": "csb_org_compliance/ccx-compliance-124"
11+
},
12+
{
13+
"task_id": "ccx-onboard-search-207",
14+
"benchmark": "csb_org_onboarding",
15+
"task_dir": "csb_org_onboarding/ccx-onboard-search-207"
16+
},
17+
{
18+
"task_id": "ccx-onboard-search-212",
19+
"benchmark": "csb_org_onboarding",
20+
"task_dir": "csb_org_onboarding/ccx-onboard-search-212"
21+
},
22+
{
23+
"task_id": "envoy-grpc-server-impl-001",
24+
"benchmark": "csb_sdlc_feature",
25+
"task_dir": "csb_sdlc_feature/envoy-grpc-server-impl-001"
26+
},
27+
{
28+
"task_id": "bustub-hyperloglog-impl-001",
29+
"benchmark": "csb_sdlc_feature",
30+
"task_dir": "csb_sdlc_feature/bustub-hyperloglog-impl-001"
31+
},
32+
{
33+
"task_id": "nodebb-plugin-validate-fix-001",
34+
"benchmark": "csb_sdlc_fix",
35+
"task_dir": "csb_sdlc_fix/nodebb-plugin-validate-fix-001"
36+
},
37+
{
38+
"task_id": "django-sensitive-file-exclusion-001",
39+
"benchmark": "csb_sdlc_secure",
40+
"task_dir": "csb_sdlc_secure/django-sensitive-file-exclusion-001"
41+
},
42+
{
43+
"task_id": "aspnetcore-code-review-001",
44+
"benchmark": "csb_sdlc_test",
45+
"task_dir": "csb_sdlc_test/aspnetcore-code-review-001"
46+
}
47+
]
48+
}

configs/registry_smoke_matrix.json

Lines changed: 26 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
{
22
"metadata": {
3-
"description": "Curated local-Docker smoke matrix for registry-ready task validation across baseline, sg_only, and artifact_only variants.",
3+
"description": "Curated local-Docker smoke matrix for registry-ready task validation across baseline and sg_only variants, including regression sentinel tasks from historically fragile families.",
44
"purpose": "Catch task contract drift and common harness portability regressions without running a full benchmark batch."
55
},
66
"tasks": [
@@ -14,6 +14,11 @@
1414
"benchmark": "csb_org_onboarding",
1515
"task_dir": "csb_org_onboarding/ccx-onboard-search-207"
1616
},
17+
{
18+
"task_id": "ccx-onboard-search-212",
19+
"benchmark": "csb_org_onboarding",
20+
"task_dir": "csb_org_onboarding/ccx-onboard-search-212"
21+
},
1722
{
1823
"task_id": "ansible-galaxy-tar-regression-prove-001",
1924
"benchmark": "csb_sdlc_debug",
@@ -34,11 +39,26 @@
3439
"benchmark": "csb_sdlc_feature",
3540
"task_dir": "csb_sdlc_feature/envoy-grpc-server-impl-001"
3641
},
42+
{
43+
"task_id": "bustub-hyperloglog-impl-001",
44+
"benchmark": "csb_sdlc_feature",
45+
"task_dir": "csb_sdlc_feature/bustub-hyperloglog-impl-001"
46+
},
3747
{
3848
"task_id": "flink-window-late-data-fix-001",
3949
"benchmark": "csb_sdlc_fix",
4050
"task_dir": "csb_sdlc_fix/flink-window-late-data-fix-001"
4151
},
52+
{
53+
"task_id": "element-web-unread-indicators-diverge-fix-001",
54+
"benchmark": "csb_sdlc_fix",
55+
"task_dir": "csb_sdlc_fix/element-web-unread-indicators-diverge-fix-001"
56+
},
57+
{
58+
"task_id": "nodebb-plugin-validate-fix-001",
59+
"benchmark": "csb_sdlc_fix",
60+
"task_dir": "csb_sdlc_fix/nodebb-plugin-validate-fix-001"
61+
},
4262
{
4363
"task_id": "django-request-factory-refac-001",
4464
"benchmark": "csb_sdlc_refactor",
@@ -58,6 +78,11 @@
5878
"task_id": "numpy-dtype-localize-001",
5979
"benchmark": "csb_sdlc_understand",
6080
"task_dir": "csb_sdlc_understand/numpy-dtype-localize-001"
81+
},
82+
{
83+
"task_id": "clickhouse-mergetree-arch-understand-001",
84+
"benchmark": "csb_sdlc_understand",
85+
"task_dir": "csb_sdlc_understand/clickhouse-mergetree-arch-understand-001"
6186
}
6287
]
6388
}

configs/selected_benchmark_tasks.json

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7099,8 +7099,6 @@
70997099
"task_complexity": 0.15,
71007100
"task_complexity_label": "medium",
71017101
"task_complexity_source": "ground_truth_meta_plus_registry",
7102-
"execution_env": "local_docker_only",
7103-
"daytona_incompatible_reason": "repo_too_large_for_10gb_sandbox",
71047102
"repo_approx_loc_source": "cloc",
71057103
"repo_cloc_total_files": 24074,
71067104
"repo_cloc_top_languages": [

configs/validate_one_per_benchmark.sh

Lines changed: 28 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
# bash configs/validate_one_per_benchmark.sh --smoke-runtime [--smoke-timeout-sec 300] [--dry-run]
1111
# bash configs/validate_one_per_benchmark.sh --sg-only [--smoke-timeout-sec 600] [--dry-run]
1212
# bash configs/validate_one_per_benchmark.sh --artifact-only [--smoke-timeout-sec 600] [--dry-run]
13+
# bash configs/validate_one_per_benchmark.sh --selection-file configs/registry_smoke_matrix.json --exact-selection --smoke-runtime
1314
#
1415
# --sg-only: swaps Dockerfile -> Dockerfile.sg_only before each smoke, then restores.
1516
# Implies --smoke-runtime. Tests that sg_only_env images build and verify.
@@ -38,6 +39,7 @@ ARTIFACT_ONLY=false
3839
SMOKE_TIMEOUT_SEC=300
3940
SMOKE_TIMEOUT_OVERRIDES="${SMOKE_TIMEOUT_OVERRIDES:-}"
4041
MAX_CONCURRENT=0
42+
EXACT_SELECTION=false
4143

4244
while [[ $# -gt 0 ]]; do
4345
case "$1" in
@@ -75,6 +77,10 @@ while [[ $# -gt 0 ]]; do
7577
MAX_CONCURRENT="${2:-0}"
7678
shift 2
7779
;;
80+
--exact-selection)
81+
EXACT_SELECTION=true
82+
shift
83+
;;
7884
*)
7985
echo "Unknown option: $1"
8086
exit 1
@@ -152,7 +158,20 @@ wait_for_slot() {
152158
done
153159
}
154160

155-
# Extract first task per benchmark into arrays
161+
# Extract task selection into arrays
162+
if [ "$EXACT_SELECTION" = true ]; then
163+
readarray -t TASK_LINES < <(python3 -c "
164+
import json
165+
sel = json.load(open('$SELECTION_FILE'))
166+
archived = set('$ARCHIVED_SUITES'.split())
167+
tasks = sel['tasks'] if isinstance(sel, dict) and 'tasks' in sel else sel
168+
for t in tasks:
169+
bm = t['benchmark']
170+
if bm in archived:
171+
continue
172+
print(f'{bm}\tbenchmarks/{t[\"task_dir\"]}')
173+
")
174+
else
156175
readarray -t TASK_LINES < <(python3 -c "
157176
import json
158177
sel = json.load(open('$SELECTION_FILE'))
@@ -167,6 +186,7 @@ for t in tasks:
167186
seen.add(bm)
168187
print(f'{bm}\tbenchmarks/{t[\"task_dir\"]}')
169188
")
189+
fi
170190

171191
echo "=============================================="
172192
echo "CodeScaleBench Validation Run (parallel)"
@@ -185,11 +205,16 @@ else
185205
echo "Model: $MODEL"
186206
fi
187207
echo "Selection:$SELECTION_FILE"
208+
if [ "$EXACT_SELECTION" = true ]; then
209+
echo "ModeSel: exact curated tasks"
210+
else
211+
echo "ModeSel: first task per benchmark"
212+
fi
188213
if [ "$MAX_CONCURRENT" -gt 0 ]; then
189214
echo "Parallel: up to ${MAX_CONCURRENT} tasks at once"
190-
echo "Tasks: 1 per benchmark (${#TASK_LINES[@]} total, throttled)"
215+
echo "Tasks: ${#TASK_LINES[@]} total, throttled"
191216
else
192-
echo "Tasks: 1 per benchmark (${#TASK_LINES[@]} total, all concurrent)"
217+
echo "Tasks: ${#TASK_LINES[@]} total, all concurrent"
193218
fi
194219
echo "Output: $JOBS_DIR"
195220
echo ""

docs/ops/QA_PROCESS.md

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -63,17 +63,20 @@ bash configs/validate_one_per_benchmark.sh --smoke-runtime --smoke-timeout-sec 3
6363
# Curated registry smoke for CI/local verification (baseline variant)
6464
bash configs/validate_one_per_benchmark.sh \
6565
--selection-file configs/registry_smoke_matrix.json \
66+
--exact-selection \
6667
--smoke-runtime \
6768
--max-concurrent 2
6869

6970
# Variant-specific smoke
7071
bash configs/validate_one_per_benchmark.sh \
7172
--selection-file configs/registry_smoke_matrix.json \
73+
--exact-selection \
7274
--sg-only \
7375
--max-concurrent 2
7476

7577
bash configs/validate_one_per_benchmark.sh \
76-
--selection-file configs/registry_smoke_matrix.json \
78+
--selection-file configs/registry_smoke_artifact_matrix.json \
79+
--exact-selection \
7780
--artifact-only \
7881
--max-concurrent 2
7982
```

docs/reference/TASK_CONTRACT.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -162,6 +162,8 @@ Before publishing a task, confirm:
162162
- `task.toml` storage is justified and not inflated by default
163163
- `python3 scripts/validate_tasks_preflight.py --task <task_dir>` passes
164164
- new or changed tasks also pass `--smoke-runtime` before large batch use
165+
- curated smoke coverage includes regression sentinels for task families that
166+
previously needed harness or variant fixes
165167

166168
## CI / Validation Guidance
167169

0 commit comments

Comments
 (0)