sourcegraph
diff --git a/.github/workflows/task_smoke_matrix.yml b/.github/workflows/task_smoke_matrix.yml
Lines changed: 11 additions & 5 deletions
diff --git a/benchmarks/csb_sdlc_understand/clickhouse-mergetree-arch-understand-001/task.toml b/benchmarks/csb_sdlc_understand/clickhouse-mergetree-arch-understand-001/task.toml
Lines changed: 1 addition & 1 deletion
diff --git a/configs/claude_historical_failure_rerun_mcp_20260309.json b/configs/claude_historical_failure_rerun_mcp_20260309.json
Lines changed: 75 additions & 0 deletions
diff --git a/configs/openhands_historical_failure_rerun_baseline_20260309.json b/configs/openhands_historical_failure_rerun_baseline_20260309.json
Lines changed: 19 additions & 0 deletions
diff --git a/configs/registry_smoke_artifact_matrix.json b/configs/registry_smoke_artifact_matrix.json
Lines changed: 48 additions & 0 deletions
diff --git a/configs/registry_smoke_matrix.json b/configs/registry_smoke_matrix.json
Lines changed: 26 additions & 1 deletion
diff --git a/configs/selected_benchmark_tasks.json b/configs/selected_benchmark_tasks.json
Lines changed: 0 additions & 2 deletions
diff --git a/configs/validate_one_per_benchmark.sh b/configs/validate_one_per_benchmark.sh
Lines changed: 28 additions & 3 deletions
diff --git a/docs/ops/QA_PROCESS.md b/docs/ops/QA_PROCESS.md
Lines changed: 4 additions & 1 deletion
diff --git a/docs/reference/TASK_CONTRACT.md b/docs/reference/TASK_CONTRACT.md
Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,7 @@ on:
       - "benchmarks/**"
       - "configs/validate_one_per_benchmark.sh"
       - "configs/registry_smoke_matrix.json"
+      - "configs/registry_smoke_artifact_matrix.json"
       - "scripts/validate_tasks_preflight.py"
       - "docs/reference/TASK_CONTRACT.md"
       - ".github/workflows/task_smoke_matrix.yml"
@@ -17,6 +18,7 @@ on:
       - "benchmarks/**"
       - "configs/validate_one_per_benchmark.sh"
       - "configs/registry_smoke_matrix.json"
+      - "configs/registry_smoke_artifact_matrix.json"
       - "scripts/validate_tasks_preflight.py"
       - "docs/reference/TASK_CONTRACT.md"
       - ".github/workflows/task_smoke_matrix.yml"
@@ -54,10 +56,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        variant:
-          - baseline
-          - sg-only
-          - artifact-only
+        include:
+          - variant: baseline
+            selection_file: configs/registry_smoke_matrix.json
+          - variant: sg-only
+            selection_file: configs/registry_smoke_matrix.json
+          - variant: artifact-only
+            selection_file: configs/registry_smoke_artifact_matrix.json
     steps:
       - uses: actions/checkout@v4
 
@@ -76,7 +81,8 @@ jobs:
             artifact-only) extra_args+=(--artifact-only) ;;
           esac
           bash configs/validate_one_per_benchmark.sh \
-            --selection-file configs/registry_smoke_matrix.json \
+            --selection-file "${{ matrix.selection_file }}" \
+            --exact-selection \
             --smoke-runtime \
             --smoke-timeout-sec 300 \
             --smoke-timeout-overrides "csb_sdlc_design=450,csb_sdlc_document=450,csb_sdlc_feature=600,csb_sdlc_fix=600,csb_sdlc_refactor=600,csb_sdlc_test=450,csb_sdlc_understand=450" \
 
@@ -25,7 +25,7 @@ description = "Checks task completion: file existence, content validation, patte
 build_timeout_sec = 1200.0
 cpus = 4
 memory = "8G"
-storage = "20G"
+storage = "10G"
 
 [environment.setup_scripts]
 mcp_config = '''
 
@@ -0,0 +1,75 @@
+{
+  "metadata": {
+    "name": "claude_historical_failure_rerun_mcp_20260309",
+    "description": "Focused Claude MCP rerun manifest for historical task-contract and harness sentinels.",
+    "source_bead": "CodeScaleBench-2kz",
+    "generated_date": "2026-03-09",
+    "run_mode": "mcp-only"
+  },
+  "tasks": [
+    {
+      "task_id": "ccx-onboard-search-207",
+      "benchmark": "csb_org_onboarding",
+      "task_dir": "csb_org_onboarding/ccx-onboard-search-207",
+      "language": "cpp",
+      "difficulty": "hard",
+      "repo": "mozilla/gecko-dev"
+    },
+    {
+      "task_id": "ccx-onboard-search-208",
+      "benchmark": "csb_org_onboarding",
+      "task_dir": "csb_org_onboarding/ccx-onboard-search-208",
+      "language": "cpp",
+      "difficulty": "hard",
+      "repo": "mozilla/gecko-dev"
+    },
+    {
+      "task_id": "ccx-onboard-search-210",
+      "benchmark": "csb_org_onboarding",
+      "task_dir": "csb_org_onboarding/ccx-onboard-search-210",
+      "language": "cpp",
+      "difficulty": "hard",
+      "repo": "envoyproxy/envoy"
+    },
+    {
+      "task_id": "bustub-hyperloglog-impl-001",
+      "benchmark": "csb_sdlc_feature",
+      "task_dir": "csb_sdlc_feature/bustub-hyperloglog-impl-001",
+      "language": "cpp",
+      "difficulty": "hard",
+      "repo": "cmu-db/bustub"
+    },
+    {
+      "task_id": "django-sensitive-file-exclusion-001",
+      "benchmark": "csb_sdlc_secure",
+      "task_dir": "csb_sdlc_secure/django-sensitive-file-exclusion-001",
+      "language": "python",
+      "difficulty": "hard",
+      "repo": "django/django"
+    },
+    {
+      "task_id": "flink-window-late-data-fix-001",
+      "benchmark": "csb_sdlc_fix",
+      "task_dir": "csb_sdlc_fix/flink-window-late-data-fix-001",
+      "language": "java",
+      "difficulty": "hard",
+      "repo": "apache/flink"
+    },
+    {
+      "task_id": "element-web-unread-indicators-diverge-fix-001",
+      "benchmark": "csb_sdlc_fix",
+      "task_dir": "csb_sdlc_fix/element-web-unread-indicators-diverge-fix-001",
+      "language": "typescript",
+      "difficulty": "hard",
+      "repo": "element-hq/element-web"
+    },
+    {
+      "task_id": "clickhouse-mergetree-arch-understand-001",
+      "benchmark": "csb_sdlc_understand",
+      "task_dir": "csb_sdlc_understand/clickhouse-mergetree-arch-understand-001",
+      "language": "cpp",
+      "difficulty": "hard",
+      "repo": "ClickHouse/ClickHouse"
+    }
+  ]
+}
@@ -0,0 +1,19 @@
+{
+  "metadata": {
+    "name": "openhands_historical_failure_rerun_baseline_20260309",
+    "description": "Focused OpenHands baseline rerun manifest for the remaining historical harness sentinel.",
+    "source_bead": "CodeScaleBench-2kz",
+    "generated_date": "2026-03-09",
+    "run_mode": "baseline-only"
+  },
+  "tasks": [
+    {
+      "task_id": "ccx-onboard-search-212",
+      "benchmark": "csb_org_onboarding",
+      "task_dir": "csb_org_onboarding/ccx-onboard-search-212",
+      "language": "python",
+      "difficulty": "hard",
+      "repo": "pandas-dev/pandas"
+    }
+  ]
+}
@@ -0,0 +1,48 @@
+{
+  "metadata": {
+    "description": "Curated artifact_only smoke matrix for registry-ready task validation on build-requiring and historically fragile artifact variants.",
+    "purpose": "Catch artifact transport and verifier-environment regressions without running a full benchmark batch."
+  },
+  "tasks": [
+    {
+      "task_id": "ccx-compliance-124",
+      "benchmark": "csb_org_compliance",
+      "task_dir": "csb_org_compliance/ccx-compliance-124"
+    },
+    {
+      "task_id": "ccx-onboard-search-207",
+      "benchmark": "csb_org_onboarding",
+      "task_dir": "csb_org_onboarding/ccx-onboard-search-207"
+    },
+    {
+      "task_id": "ccx-onboard-search-212",
+      "benchmark": "csb_org_onboarding",
+      "task_dir": "csb_org_onboarding/ccx-onboard-search-212"
+    },
+    {
+      "task_id": "envoy-grpc-server-impl-001",
+      "benchmark": "csb_sdlc_feature",
+      "task_dir": "csb_sdlc_feature/envoy-grpc-server-impl-001"
+    },
+    {
+      "task_id": "bustub-hyperloglog-impl-001",
+      "benchmark": "csb_sdlc_feature",
+      "task_dir": "csb_sdlc_feature/bustub-hyperloglog-impl-001"
+    },
+    {
+      "task_id": "nodebb-plugin-validate-fix-001",
+      "benchmark": "csb_sdlc_fix",
+      "task_dir": "csb_sdlc_fix/nodebb-plugin-validate-fix-001"
+    },
+    {
+      "task_id": "django-sensitive-file-exclusion-001",
+      "benchmark": "csb_sdlc_secure",
+      "task_dir": "csb_sdlc_secure/django-sensitive-file-exclusion-001"
+    },
+    {
+      "task_id": "aspnetcore-code-review-001",
+      "benchmark": "csb_sdlc_test",
+      "task_dir": "csb_sdlc_test/aspnetcore-code-review-001"
+    }
+  ]
+}
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "description": "Curated local-Docker smoke matrix for registry-ready task validation across baseline, sg_only, and artifact_only variants.",
+    "description": "Curated local-Docker smoke matrix for registry-ready task validation across baseline and sg_only variants, including regression sentinel tasks from historically fragile families.",
     "purpose": "Catch task contract drift and common harness portability regressions without running a full benchmark batch."
   },
   "tasks": [
@@ -14,6 +14,11 @@
       "benchmark": "csb_org_onboarding",
       "task_dir": "csb_org_onboarding/ccx-onboard-search-207"
     },
+    {
+      "task_id": "ccx-onboard-search-212",
+      "benchmark": "csb_org_onboarding",
+      "task_dir": "csb_org_onboarding/ccx-onboard-search-212"
+    },
     {
       "task_id": "ansible-galaxy-tar-regression-prove-001",
       "benchmark": "csb_sdlc_debug",
@@ -34,11 +39,26 @@
       "benchmark": "csb_sdlc_feature",
       "task_dir": "csb_sdlc_feature/envoy-grpc-server-impl-001"
     },
+    {
+      "task_id": "bustub-hyperloglog-impl-001",
+      "benchmark": "csb_sdlc_feature",
+      "task_dir": "csb_sdlc_feature/bustub-hyperloglog-impl-001"
+    },
     {
       "task_id": "flink-window-late-data-fix-001",
       "benchmark": "csb_sdlc_fix",
       "task_dir": "csb_sdlc_fix/flink-window-late-data-fix-001"
     },
+    {
+      "task_id": "element-web-unread-indicators-diverge-fix-001",
+      "benchmark": "csb_sdlc_fix",
+      "task_dir": "csb_sdlc_fix/element-web-unread-indicators-diverge-fix-001"
+    },
+    {
+      "task_id": "nodebb-plugin-validate-fix-001",
+      "benchmark": "csb_sdlc_fix",
+      "task_dir": "csb_sdlc_fix/nodebb-plugin-validate-fix-001"
+    },
     {
       "task_id": "django-request-factory-refac-001",
       "benchmark": "csb_sdlc_refactor",
@@ -58,6 +78,11 @@
       "task_id": "numpy-dtype-localize-001",
       "benchmark": "csb_sdlc_understand",
       "task_dir": "csb_sdlc_understand/numpy-dtype-localize-001"
+    },
+    {
+      "task_id": "clickhouse-mergetree-arch-understand-001",
+      "benchmark": "csb_sdlc_understand",
+      "task_dir": "csb_sdlc_understand/clickhouse-mergetree-arch-understand-001"
     }
   ]
 }
@@ -7099,8 +7099,6 @@
       "task_complexity": 0.15,
       "task_complexity_label": "medium",
       "task_complexity_source": "ground_truth_meta_plus_registry",
-      "execution_env": "local_docker_only",
-      "daytona_incompatible_reason": "repo_too_large_for_10gb_sandbox",
       "repo_approx_loc_source": "cloc",
       "repo_cloc_total_files": 24074,
       "repo_cloc_top_languages": [
 
@@ -10,6 +10,7 @@
 #   bash configs/validate_one_per_benchmark.sh --smoke-runtime [--smoke-timeout-sec 300] [--dry-run]
 #   bash configs/validate_one_per_benchmark.sh --sg-only [--smoke-timeout-sec 600] [--dry-run]
 #   bash configs/validate_one_per_benchmark.sh --artifact-only [--smoke-timeout-sec 600] [--dry-run]
+#   bash configs/validate_one_per_benchmark.sh --selection-file configs/registry_smoke_matrix.json --exact-selection --smoke-runtime
 #
 # --sg-only: swaps Dockerfile -> Dockerfile.sg_only before each smoke, then restores.
 #            Implies --smoke-runtime. Tests that sg_only_env images build and verify.
@@ -38,6 +39,7 @@ ARTIFACT_ONLY=false
 SMOKE_TIMEOUT_SEC=300
 SMOKE_TIMEOUT_OVERRIDES="${SMOKE_TIMEOUT_OVERRIDES:-}"
 MAX_CONCURRENT=0
+EXACT_SELECTION=false
 
 while [[ $# -gt 0 ]]; do
     case "$1" in
@@ -75,6 +77,10 @@ while [[ $# -gt 0 ]]; do
             MAX_CONCURRENT="${2:-0}"
             shift 2
             ;;
+        --exact-selection)
+            EXACT_SELECTION=true
+            shift
+            ;;
         *)
             echo "Unknown option: $1"
             exit 1
@@ -152,7 +158,20 @@ wait_for_slot() {
     done
 }
 
-# Extract first task per benchmark into arrays
+# Extract task selection into arrays
+if [ "$EXACT_SELECTION" = true ]; then
+readarray -t TASK_LINES < <(python3 -c "
+import json
+sel = json.load(open('$SELECTION_FILE'))
+archived = set('$ARCHIVED_SUITES'.split())
+tasks = sel['tasks'] if isinstance(sel, dict) and 'tasks' in sel else sel
+for t in tasks:
+    bm = t['benchmark']
+    if bm in archived:
+        continue
+    print(f'{bm}\tbenchmarks/{t[\"task_dir\"]}')
+")
+else
 readarray -t TASK_LINES < <(python3 -c "
 import json
 sel = json.load(open('$SELECTION_FILE'))
@@ -167,6 +186,7 @@ for t in tasks:
         seen.add(bm)
         print(f'{bm}\tbenchmarks/{t[\"task_dir\"]}')
 ")
+fi
 
 echo "=============================================="
 echo "CodeScaleBench Validation Run (parallel)"
@@ -185,11 +205,16 @@ else
     echo "Model:   $MODEL"
 fi
 echo "Selection:$SELECTION_FILE"
+if [ "$EXACT_SELECTION" = true ]; then
+    echo "ModeSel: exact curated tasks"
+else
+    echo "ModeSel: first task per benchmark"
+fi
 if [ "$MAX_CONCURRENT" -gt 0 ]; then
     echo "Parallel: up to ${MAX_CONCURRENT} tasks at once"
-    echo "Tasks:   1 per benchmark (${#TASK_LINES[@]} total, throttled)"
+    echo "Tasks:   ${#TASK_LINES[@]} total, throttled"
 else
-    echo "Tasks:   1 per benchmark (${#TASK_LINES[@]} total, all concurrent)"
+    echo "Tasks:   ${#TASK_LINES[@]} total, all concurrent"
 fi
 echo "Output:  $JOBS_DIR"
 echo ""
 
@@ -63,17 +63,20 @@ bash configs/validate_one_per_benchmark.sh --smoke-runtime --smoke-timeout-sec 3
 # Curated registry smoke for CI/local verification (baseline variant)
 bash configs/validate_one_per_benchmark.sh \
   --selection-file configs/registry_smoke_matrix.json \
+  --exact-selection \
   --smoke-runtime \
   --max-concurrent 2
 
 # Variant-specific smoke
 bash configs/validate_one_per_benchmark.sh \
   --selection-file configs/registry_smoke_matrix.json \
+  --exact-selection \
   --sg-only \
   --max-concurrent 2
 
 bash configs/validate_one_per_benchmark.sh \
-  --selection-file configs/registry_smoke_matrix.json \
+  --selection-file configs/registry_smoke_artifact_matrix.json \
+  --exact-selection \
   --artifact-only \
   --max-concurrent 2
 ```
 
@@ -162,6 +162,8 @@ Before publishing a task, confirm:
 - `task.toml` storage is justified and not inflated by default
 - `python3 scripts/validate_tasks_preflight.py --task <task_dir>` passes
 - new or changed tasks also pass `--smoke-runtime` before large batch use
+- curated smoke coverage includes regression sentinels for task families that
+  previously needed harness or variant fixes
 
 ## CI / Validation Guidance