chore: sync benchmark instructions, preflight checks, and local ignore rules

sjarmak · sjarmak · commit 11765ac5eb4f · 2026-02-17T15:46:47.000Z
diff --git a/.gitignore b/.gitignore
@@ -43,3 +43,7 @@ vendor/dependeval_repos/
 *.pem
 .claude/*
 !.claude/commands/
+
+# Local benchmark agent/session state
+benchmarks/.claude/
+benchmarks/locobench-agent/.claude/
diff --git a/AGENTS.md b/AGENTS.md
@@ -79,4 +79,6 @@ python3 scripts/generate_eval_report.py
 - `configs/_common.sh` - shared run infra (parallelism, token refresh, validation hooks)
 - `configs/*_2config.sh` - per-suite run launchers
 - `configs/validate_one_per_benchmark.sh --smoke-runtime` - quick no-agent runtime smoke (1 task per benchmark)
+  - Smoke interpretation: `smoke_verifier_nonzero_with_reward` is acceptable in no-agent mode; use `--smoke-timeout-overrides "ccb_pytorch=900,ccb_tac=900,ccb_crossrepo=900"` for timeout-heavy suites.
+  - Timeout diagnostics: `smoke_build_timeout` (image build phase) vs `smoke_verify_timeout` (verifier phase).
 - `scripts/promote_run.py` - staging to official promotion flow
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -79,4 +79,6 @@ python3 scripts/generate_eval_report.py
 - `configs/_common.sh` - shared run infra (parallelism, token refresh, validation hooks)
 - `configs/*_2config.sh` - per-suite run launchers
 - `configs/validate_one_per_benchmark.sh --smoke-runtime` - quick no-agent runtime smoke (1 task per benchmark)
+  - Smoke interpretation: `smoke_verifier_nonzero_with_reward` is acceptable in no-agent mode; use `--smoke-timeout-overrides "ccb_pytorch=900,ccb_tac=900,ccb_crossrepo=900"` for timeout-heavy suites.
+  - Timeout diagnostics: `smoke_build_timeout` (image build phase) vs `smoke_verify_timeout` (verifier phase).
 - `scripts/promote_run.py` - staging to official promotion flow
diff --git a/benchmarks/ccb_docgen/docgen-k8s-apiserver-001/instruction.md b/benchmarks/ccb_docgen/docgen-k8s-apiserver-001/instruction.md
@@ -9,18 +9,18 @@ Produce a subsystem architecture and extension guide for the apiserver library w
 
 ## Scope
 
-Primary focus area: `staging/src/k8s.io/apiserver`.
-
-Your document must explain component responsibilities, end-to-end flow, and extension/operational tradeoffs.
-
-## Required Sections
-
-1. **Subsystem Overview** — purpose, boundaries, and upstream/downstream dependencies
-2. **Key Components** — major types/modules and their responsibilities
-3. **End-to-End Flow** — request/control flow with concrete file-backed references
-4. **Failure Modes & Tradeoffs** — common failures, limits, and design tradeoffs
-5. **Extension Points** — where behavior can be customized and associated risks
-6. **Source File Map** — list the most relevant files/directories used in your analysis
+Focus on the Kubernetes apiserver library subsystem.
+Your document must explain component responsibilities, end-to-end flow, and extension or operational tradeoffs.
+
+## Content Expectations
+
+Address all of the following in your own structure:
+- subsystem purpose, boundaries, and upstream/downstream dependencies
+- key components and how responsibilities are split
+- end-to-end control/request flow with concrete repository evidence
+- failure modes, limits, and design tradeoffs
+- extension points, customization hooks, and associated risks
+- a concise map of the most important files/directories used in your analysis
 
 ## Quality Bar
 
diff --git a/benchmarks/ccb_docgen/docgen-k8s-applyconfig-001/instruction.md b/benchmarks/ccb_docgen/docgen-k8s-applyconfig-001/instruction.md
@@ -9,18 +9,18 @@ Produce a deep guide for applyconfigurations and Server-Side Apply semantics, in
 
 ## Scope
 
-Primary focus area: `staging/src/k8s.io/client-go/applyconfigurations`.
-
-Your document must explain component responsibilities, end-to-end flow, and extension/operational tradeoffs.
-
-## Required Sections
-
-1. **Subsystem Overview** — purpose, boundaries, and upstream/downstream dependencies
-2. **Key Components** — major types/modules and their responsibilities
-3. **End-to-End Flow** — request/control flow with concrete file-backed references
-4. **Failure Modes & Tradeoffs** — common failures, limits, and design tradeoffs
-5. **Extension Points** — where behavior can be customized and associated risks
-6. **Source File Map** — list the most relevant files/directories used in your analysis
+Focus on the applyconfigurations and Server-Side Apply subsystem.
+Your document must explain component responsibilities, end-to-end flow, and extension or operational tradeoffs.
+
+## Content Expectations
+
+Address all of the following in your own structure:
+- subsystem purpose, boundaries, and upstream/downstream dependencies
+- key components and how responsibilities are split
+- end-to-end control/request flow with concrete repository evidence
+- failure modes, limits, and design tradeoffs
+- extension points, customization hooks, and associated risks
+- a concise map of the most important files/directories used in your analysis
 
 ## Quality Bar
 
diff --git a/benchmarks/ccb_docgen/docgen-k8s-clientgo-001/instruction.md b/benchmarks/ccb_docgen/docgen-k8s-clientgo-001/instruction.md
@@ -9,18 +9,18 @@ Produce an advanced systems guide for client-go that explains API access, contro
 
 ## Scope
 
-Primary focus area: `staging/src/k8s.io/client-go`.
-
-Your document must explain component responsibilities, end-to-end flow, and extension/operational tradeoffs.
-
-## Required Sections
-
-1. **Subsystem Overview** — purpose, boundaries, and upstream/downstream dependencies
-2. **Key Components** — major types/modules and their responsibilities
-3. **End-to-End Flow** — request/control flow with concrete file-backed references
-4. **Failure Modes & Tradeoffs** — common failures, limits, and design tradeoffs
-5. **Extension Points** — where behavior can be customized and associated risks
-6. **Source File Map** — list the most relevant files/directories used in your analysis
+Focus on the client-go subsystem.
+Your document must explain component responsibilities, end-to-end flow, and extension or operational tradeoffs.
+
+## Content Expectations
+
+Address all of the following in your own structure:
+- subsystem purpose, boundaries, and upstream/downstream dependencies
+- key components and how responsibilities are split
+- end-to-end control/request flow with concrete repository evidence
+- failure modes, limits, and design tradeoffs
+- extension points, customization hooks, and associated risks
+- a concise map of the most important files/directories used in your analysis
 
 ## Quality Bar
 
diff --git a/benchmarks/ccb_docgen/docgen-k8s-cm-001/instruction.md b/benchmarks/ccb_docgen/docgen-k8s-cm-001/instruction.md
@@ -9,18 +9,18 @@ Produce a subsystem architecture guide for kubelet container manager, including
 
 ## Scope
 
-Primary focus area: `pkg/kubelet/cm`.
-
-Your document must explain component responsibilities, end-to-end flow, and extension/operational tradeoffs.
-
-## Required Sections
-
-1. **Subsystem Overview** — purpose, boundaries, and upstream/downstream dependencies
-2. **Key Components** — major types/modules and their responsibilities
-3. **End-to-End Flow** — request/control flow with concrete file-backed references
-4. **Failure Modes & Tradeoffs** — common failures, limits, and design tradeoffs
-5. **Extension Points** — where behavior can be customized and associated risks
-6. **Source File Map** — list the most relevant files/directories used in your analysis
+Focus on the kubelet container manager subsystem.
+Your document must explain component responsibilities, end-to-end flow, and extension or operational tradeoffs.
+
+## Content Expectations
+
+Address all of the following in your own structure:
+- subsystem purpose, boundaries, and upstream/downstream dependencies
+- key components and how responsibilities are split
+- end-to-end control/request flow with concrete repository evidence
+- failure modes, limits, and design tradeoffs
+- extension points, customization hooks, and associated risks
+- a concise map of the most important files/directories used in your analysis
 
 ## Quality Bar
 
diff --git a/benchmarks/ccb_docgen/docgen-k8s-fairqueuing-001/instruction.md b/benchmarks/ccb_docgen/docgen-k8s-fairqueuing-001/instruction.md
@@ -9,18 +9,18 @@ Produce an algorithmic deep-dive on APF QueueSet behavior, dispatch flow, and fa
 
 ## Scope
 
-Primary focus area: `staging/src/k8s.io/apiserver/pkg/util/flowcontrol/fairqueuing/queueset`.
-
-Your document must explain component responsibilities, end-to-end flow, and extension/operational tradeoffs.
-
-## Required Sections
-
-1. **Subsystem Overview** — purpose, boundaries, and upstream/downstream dependencies
-2. **Key Components** — major types/modules and their responsibilities
-3. **End-to-End Flow** — request/control flow with concrete file-backed references
-4. **Failure Modes & Tradeoffs** — common failures, limits, and design tradeoffs
-5. **Extension Points** — where behavior can be customized and associated risks
-6. **Source File Map** — list the most relevant files/directories used in your analysis
+Focus on the APF QueueSet subsystem.
+Your document must explain component responsibilities, end-to-end flow, and extension or operational tradeoffs.
+
+## Content Expectations
+
+Address all of the following in your own structure:
+- subsystem purpose, boundaries, and upstream/downstream dependencies
+- key components and how responsibilities are split
+- end-to-end control/request flow with concrete repository evidence
+- failure modes, limits, and design tradeoffs
+- extension points, customization hooks, and associated risks
+- a concise map of the most important files/directories used in your analysis
 
 ## Quality Bar
 
diff --git a/benchmarks/ccb_security/sec-reach-002/instruction.md b/benchmarks/ccb_security/sec-reach-002/instruction.md
@@ -22,12 +22,6 @@ Determine whether this function is **exploitable from external HTTP input** (i.e
 - **Repository**: envoyproxy/envoy @ v1.31.1
 - **Language**: C++
 - **Focus Area**: Admin interface architecture and access control
-- **Relevant Files**:
-  - `source/server/admin/admin.cc` (admin endpoint handlers)
-  - `source/server/admin/admin.h` (admin interface definition)
-  - `source/server/server.cc` (server initialization, admin interface binding)
-  - `envoy/server/admin.h` (admin interface API)
-  - Configuration files showing admin interface setup
 
 ## Background: Admin Interface vs Data Plane
 
@@ -45,33 +39,13 @@ Envoy has two separate network interfaces:
 
 ## Expected Deliverable
 
-Write your analysis to `/logs/agent/triage.md` with the following sections:
-
-### 1. Summary
-- Function under analysis
-- Overall exploitability assessment (EXPLOITABLE / MITIGATED / UNREACHABLE)
-- Key finding (1-2 sentences)
-
-### 2. Call Path Analysis
-- Complete call chain from HTTP request reception to `parseAuthority()`
-- Entry points (HTTP/1 vs HTTP/2)
-- Any conditional branching in the call path
-
-### 3. Protection Mechanisms
-- Network interface binding configuration (which IP/port?)
-- Access control mechanisms (authentication, IP allowlisting, etc.)
-- Separation between admin interface and data plane listeners
-- How Envoy routes requests to admin handlers vs regular HTTP handlers
-
-### 4. Exploitability Assessment
-- Can an external attacker (from the internet) trigger `/drain_listeners`?
-- What network configuration would be required for an attack to succeed?
-- Is the vulnerability **EXPLOITABLE** (reachable from external traffic) or **UNREACHABLE** (admin-only)?
-
-### 5. Severity Assessment
-- If exploitable: impact (DoS, memory corruption, information disclosure, RCE)
-- If mitigated: description of the protection mechanism
-- Confidence level in your assessment
+Write your analysis to `/logs/agent/triage.md`.
+
+Include enough evidence to support your exploitability conclusion, including:
+- the full call chain from request reception to the target handler
+- the boundary between data-plane and admin-plane request handling
+- the controls that prevent or permit external reachability
+- the network/configuration conditions under which the endpoint would become externally reachable
 
 ## Important Notes
 
diff --git a/benchmarks/ccb_swebenchpro/tasks/instance_nodebb-nodebb-76c6e30282906ac664f2c9278fc90999b27b1f48-vd59a5728dfc977f44533186ace531248c2917516/tests/test.sh b/benchmarks/ccb_swebenchpro/tasks/instance_nodebb-nodebb-76c6e30282906ac664f2c9278fc90999b27b1f48-vd59a5728dfc977f44533186ace531248c2917516/tests/test.sh
diff --git a/benchmarks/ccb_tac/tac-troubleshoot-dev-setup/instruction.md b/benchmarks/ccb_tac/tac-troubleshoot-dev-setup/instruction.md
@@ -13,9 +13,8 @@ requirements documentation and fixing version mismatches.
 
 ## Task
 
-You previously set up an environment to run the copilot-arena-server. The library 
-installation config for your environment is in the `myenv.txt` file located in the 
-/workspace folder. 
+You previously set up an environment to run the copilot-arena-server. The library
+installation config for your environment is in `myenv.txt` within the workspace.
 
 Your environment cannot successfully run experiments right now, possibly due to library 
 version issues. You will need to:
diff --git a/configs/experiments/run_mcp_ablation_taskpack.sh b/configs/experiments/run_mcp_ablation_taskpack.sh
@@ -132,16 +132,35 @@ while IFS=$'\t' read -r bm tid tdir repo; do
   echo "Task: ${tid} (${bm})"
   echo "Dir:  benchmarks/${tdir}"
 
+  ensure_fresh_token_all
+
+  base_pid=""
+  full_pid=""
+
   if [ "$RUN_BASELINE" = true ]; then
-    echo "Mode: baseline"
-    ensure_fresh_token_all
-    run_one "$tdir" "baseline" "$repo"
+    echo "Mode: baseline (paired launch)"
+    run_one "$tdir" "baseline" "$repo" &
+    base_pid=$!
   fi
 
   if [ "$RUN_FULL" = true ]; then
-    echo "Mode: sourcegraph_full"
-    ensure_fresh_token_all
-    run_one "$tdir" "sourcegraph_full" "$repo"
+    echo "Mode: sourcegraph_full (paired launch)"
+    run_one "$tdir" "sourcegraph_full" "$repo" &
+    full_pid=$!
+  fi
+
+  base_rc=0
+  full_rc=0
+  if [ -n "$base_pid" ]; then
+    wait "$base_pid" || base_rc=$?
+  fi
+  if [ -n "$full_pid" ]; then
+    wait "$full_pid" || full_rc=$?
+  fi
+
+  if [ "$base_rc" -ne 0 ] || [ "$full_rc" -ne 0 ]; then
+    echo "ERROR: Paired run failed for task ${tid} (baseline_rc=${base_rc}, sourcegraph_full_rc=${full_rc})"
+    exit 1
   fi
 done < /tmp/mcp_ablation_tasks.tsv
 
diff --git a/configs/validate_one_per_benchmark.sh b/configs/validate_one_per_benchmark.sh
@@ -27,6 +27,7 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S)
 DRY_RUN=false
 SMOKE_RUNTIME=false
 SMOKE_TIMEOUT_SEC=300
+SMOKE_TIMEOUT_OVERRIDES="${SMOKE_TIMEOUT_OVERRIDES:-ccb_pytorch=900,ccb_tac=900,ccb_crossrepo=900}"
 
 while [[ $# -gt 0 ]]; do
     case "$1" in
@@ -42,6 +43,10 @@ while [[ $# -gt 0 ]]; do
             SMOKE_TIMEOUT_SEC="${2:-}"
             shift 2
             ;;
+        --smoke-timeout-overrides)
+            SMOKE_TIMEOUT_OVERRIDES="${2:-}"
+            shift 2
+            ;;
         *)
             echo "Unknown option: $1"
             exit 1
@@ -84,6 +89,7 @@ echo "=============================================="
 if [ "$SMOKE_RUNTIME" = true ]; then
     echo "Mode:    runtime smoke (no agent)"
     echo "Timeout: ${SMOKE_TIMEOUT_SEC}s per task"
+    echo "Overrides: ${SMOKE_TIMEOUT_OVERRIDES:-<none>}"
 else
     echo "Mode:    baseline harbor run (no MCP)"
     echo "Model:   $MODEL"
@@ -102,10 +108,22 @@ if [ "$DRY_RUN" = true ]; then
     echo "[DRY RUN] Verifying task directories..."
     for line in "${TASK_LINES[@]}"; do
         IFS=$'\t' read -r bm path <<< "$line"
+        TASK_TIMEOUT="$SMOKE_TIMEOUT_SEC"
+        if [ "$SMOKE_RUNTIME" = true ] && [ -n "${SMOKE_TIMEOUT_OVERRIDES:-}" ]; then
+            IFS=',' read -ra __OVR_ARR <<< "$SMOKE_TIMEOUT_OVERRIDES"
+            for __pair in "${__OVR_ARR[@]}"; do
+                __k="${__pair%%=*}"
+                __v="${__pair#*=}"
+                if [ "$__k" = "$bm" ] && [ -n "$__v" ]; then
+                    TASK_TIMEOUT="$__v"
+                    break
+                fi
+            done
+        fi
         if [ -d "$path" ] && [ -f "$path/task.toml" ]; then
             echo "  OK   $path"
             if [ "$SMOKE_RUNTIME" = true ]; then
-                echo "      cmd: python3 scripts/validate_tasks_preflight.py --task $path --smoke-runtime --smoke-timeout-sec $SMOKE_TIMEOUT_SEC --format json"
+                echo "      cmd: python3 scripts/validate_tasks_preflight.py --task $path --smoke-runtime --smoke-timeout-sec $TASK_TIMEOUT --format json"
             else
                 echo "      cmd: BASELINE_MCP_TYPE=none harbor run --path $path --agent-import-path $AGENT_PATH --model $MODEL ..."
             fi
@@ -133,11 +151,23 @@ for line in "${TASK_LINES[@]}"; do
     fi
 
     if [ "$SMOKE_RUNTIME" = true ]; then
+        TASK_TIMEOUT="$SMOKE_TIMEOUT_SEC"
+        if [ -n "${SMOKE_TIMEOUT_OVERRIDES:-}" ]; then
+            IFS=',' read -ra __OVR_ARR <<< "$SMOKE_TIMEOUT_OVERRIDES"
+            for __pair in "${__OVR_ARR[@]}"; do
+                __k="${__pair%%=*}"
+                __v="${__pair#*=}"
+                if [ "$__k" = "$bm" ] && [ -n "$__v" ]; then
+                    TASK_TIMEOUT="$__v"
+                    break
+                fi
+            done
+        fi
         echo "Launching runtime smoke: $bm ($path)"
         python3 scripts/validate_tasks_preflight.py \
             --task "$abs_path" \
             --smoke-runtime \
-            --smoke-timeout-sec "$SMOKE_TIMEOUT_SEC" \
+            --smoke-timeout-sec "$TASK_TIMEOUT" \
             --format json \
             > "$JOBS_DIR/${bm}.log" 2>&1 &
     else
diff --git a/docs/QA_PROCESS.md b/docs/QA_PROCESS.md
@@ -36,13 +36,23 @@ Pre-flight now also supports a **runtime smoke mode** that validates task runtim
 - Builds the task Docker image
 - Runs verifier script (`/tests/test.sh`) in-container
 - Checks reward file creation (`/logs/verifier/reward.txt` or `.json`)
+- Tries both common Docker build contexts automatically (the task root first, then `environment/`)
 
 Use this for new/modified tasks and before large reruns involving task-definition changes.
 
+Interpretation notes:
+- `WARNING smoke_verifier_nonzero_with_reward` is acceptable for no-agent smoke (the dummy solution is expected to fail the tests, but the verifier wiring is healthy).
+- `CRITICAL smoke_build_timeout` means Docker image build exceeded timeout.
+- `CRITICAL smoke_verify_timeout` means verifier execution exceeded timeout.
+
 **Quick sweep helper (one task per benchmark):**
 ```bash
 # No-agent runtime smoke across one representative task per benchmark
 bash configs/validate_one_per_benchmark.sh --smoke-runtime --smoke-timeout-sec 300
+
+# Override per-suite timeouts for timeout-heavy suites (format: suite=seconds,suite=seconds); the list below is also the script's built-in default
+bash configs/validate_one_per_benchmark.sh --smoke-runtime --smoke-timeout-sec 300 \
+  --smoke-timeout-overrides "ccb_pytorch=900,ccb_tac=900,ccb_crossrepo=900"
 ```
 
 **Usage:**
@@ -61,6 +71,10 @@ python3 scripts/validate_tasks_preflight.py --task benchmarks/ccb_largerepo/big-
 
 # Runtime smoke for a suite (expensive)
 python3 scripts/validate_tasks_preflight.py --suite ccb_largerepo --smoke-runtime --smoke-timeout-sec 900
+
+# Separate build/verifier timeouts (for phase-level diagnosis)
+python3 scripts/validate_tasks_preflight.py --task benchmarks/ccb_pytorch/sgt-001 \
+  --smoke-runtime --smoke-build-timeout-sec 900 --smoke-verify-timeout-sec 900
 ```
 
 ---
diff --git a/scripts/validate_tasks_preflight.py b/scripts/validate_tasks_preflight.py