fix(ci): use completed nightly runs for benchmark baseline, not just successful (#15806)

devin-ai-integration[bot] · lifanzou · web-flow · commit 9754329ac3b8 · 2026-05-08T07:54:41.000-07:00
The benchmark baseline fetch was filtering workflow runs by status=success,
which only returns runs where EVERY job succeeded. When any single generator
E2E job fails, the entire nightly workflow is marked as failure, hiding valid
baseline data from all other generators.

This caused stale baselines — e.g., Swift E2E showed 307s (from old successful
runs) instead of ~700s (from recent runs that failed due to other generators).

Changes:
- Use status=completed + jq filter for success/failure conclusions, so runs
  where some generators failed still contribute valid data for others
- Filter out entries with non-zero exit_code in lookup functions to avoid
  including timings from failed generator runs
- Add tests for exit_code filtering

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: barry.zou <barry.zou@buildwithfern.com>
diff --git a/.github/scripts/format-benchmark-report.sh b/.github/scripts/format-benchmark-report.sh
@@ -43,7 +43,10 @@ lookup_e2e_baseline() {
       local e2e_file="${run_dir}e2e/${generator}.jsonl"
       [ -f "$e2e_file" ] || continue
       local dur
-      dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec) | .duration_seconds' "$e2e_file" 2>/dev/null || true)
+      # Only include runs that completed successfully (exit_code 0 or absent).
+      # Failed runs from partially-successful nightly workflows may have
+      # misleading timings.
+      dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec and (.exit_code == 0 or .exit_code == null)) | .duration_seconds' "$e2e_file" 2>/dev/null || true)
       if [ -n "$dur" ] && [ "$dur" != "null" ] && [ "$dur" != "0" ]; then
         durations+=("$dur")
       fi
@@ -58,7 +61,7 @@ lookup_e2e_baseline() {
     local e2e_file="${MAIN_DIR}/e2e/${generator}.jsonl"
     if [ -f "$e2e_file" ]; then
       local dur
-      dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec) | .duration_seconds' "$e2e_file" 2>/dev/null || true)
+      dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec and (.exit_code == 0 or .exit_code == null)) | .duration_seconds' "$e2e_file" 2>/dev/null || true)
       if [ -n "$dur" ] && [ "$dur" != "null" ] && [ "$dur" != "0" ]; then
         E2E_BASELINE_VAL="$dur"
         E2E_BASELINE_RUNS=1
@@ -84,7 +87,7 @@ lookup_baseline() {
       local hist_file="${run_dir}/${generator}.jsonl"
       [ -f "$hist_file" ] || continue
       local dur
-      dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec) | .duration_seconds' "$hist_file" 2>/dev/null || true)
+      dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec and (.exit_code == 0 or .exit_code == null)) | .duration_seconds' "$hist_file" 2>/dev/null || true)
       if [ -n "$dur" ] && [ "$dur" != "null" ] && [ "$dur" != "0" ]; then
         durations+=("$dur")
       fi
diff --git a/.github/scripts/format-docs-benchmark-report.sh b/.github/scripts/format-docs-benchmark-report.sh
@@ -42,7 +42,7 @@ lookup_docs_baseline() {
       for f in "${run_dir}"*.jsonl; do
         [ -f "$f" ] || continue
         local dur
-        dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec) | .duration_seconds' "$f" 2>/dev/null || true)
+        dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec and (.exit_code == 0 or .exit_code == null)) | .duration_seconds' "$f" 2>/dev/null || true)
         if [ -n "$dur" ] && [ "$dur" != "null" ] && [ "$dur" != "0" ]; then
           durations+=("$dur")
         fi
@@ -57,7 +57,7 @@ lookup_docs_baseline() {
     for f in "${BASELINE_DIR}"/*.jsonl; do
       [ -f "$f" ] || continue
       local dur
-      dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec) | .duration_seconds' "$f" 2>/dev/null || true)
+      dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec and (.exit_code == 0 or .exit_code == null)) | .duration_seconds' "$f" 2>/dev/null || true)
       if [ -n "$dur" ] && [ "$dur" != "null" ] && [ "$dur" != "0" ]; then
         BASELINE_VAL="$dur"
         BASELINE_RUNS=1
diff --git a/.github/scripts/test-format-benchmark-report.sh b/.github/scripts/test-format-benchmark-report.sh
@@ -588,6 +588,64 @@ test_last_updated_timestamp() {
 }
 
 test_last_updated_timestamp
+
+# Test 25: Failed baseline runs (non-zero exit_code) are excluded from median
+test_excludes_failed_runs() {
+  echo "Test: History runs with non-zero exit_code are excluded from baseline"
+  setup_dirs
+  echo '{"generator":"swift-sdk","spec":"square","duration_seconds":50,"exit_code":0}' > "$PR_DIR/swift-sdk.jsonl"
+
+  # Create 5 history entries: 3 succeeded at ~700s, 2 failed mid-run at ~150s.
+  # Only the 3 successful runs should contribute to the E2E median (700).
+  mkdir -p "$MAIN_DIR/history/run1/e2e"
+  mkdir -p "$MAIN_DIR/history/run2/e2e"
+  mkdir -p "$MAIN_DIR/history/run3/e2e"
+  mkdir -p "$MAIN_DIR/history/run4/e2e"
+  mkdir -p "$MAIN_DIR/history/run5/e2e"
+  # Generator-only baselines: all 5 runs succeed (exit_code 0) at 42s
+  for r in run1 run2 run3 run4 run5; do
+    echo '{"generator":"swift-sdk","spec":"square","duration_seconds":42,"exit_code":0}' > "$MAIN_DIR/history/$r/swift-sdk.jsonl"
+  done
+  # E2E: 3 successful runs (exit_code 0) at 690, 700, 710
+  echo '{"generator":"swift-sdk","spec":"square","duration_seconds":690,"exit_code":0}' > "$MAIN_DIR/history/run1/e2e/swift-sdk.jsonl"
+  echo '{"generator":"swift-sdk","spec":"square","duration_seconds":700,"exit_code":0}' > "$MAIN_DIR/history/run2/e2e/swift-sdk.jsonl"
+  echo '{"generator":"swift-sdk","spec":"square","duration_seconds":710,"exit_code":0}' > "$MAIN_DIR/history/run3/e2e/swift-sdk.jsonl"
+  # E2E: 2 failed runs (exit_code 1) with misleadingly short durations (150, 160)
+  echo '{"generator":"swift-sdk","spec":"square","duration_seconds":150,"exit_code":1}' > "$MAIN_DIR/history/run4/e2e/swift-sdk.jsonl"
+  echo '{"generator":"swift-sdk","spec":"square","duration_seconds":160,"exit_code":1}' > "$MAIN_DIR/history/run5/e2e/swift-sdk.jsonl"
+
+  OUTPUT=$(BASELINE_TIMESTAMP="2026-05-08T04:00:00Z" bash "$REPORT_SCRIPT" "$PR_DIR" "$MAIN_DIR")
+
+  # Median of [690, 700, 710] = 700, not polluted by the two failed runs.
+  # Generator-only column has n=5 (all succeeded), E2E column has n=3 (2 failed excluded).
+  assert_contains "$OUTPUT" "700s (n=3)" "E2E median excludes failed runs (n=3 not n=5)"
+  assert_contains "$OUTPUT" "42s (n=5)" "Generator-only baseline still includes all 5 runs"
+}
+
+test_excludes_failed_runs
+
+# Test 26: Generator-only baseline also excludes failed runs
+test_excludes_failed_gen_runs() {
+  echo "Test: Generator-only baseline excludes runs with non-zero exit_code"
+  setup_dirs
+  echo '{"generator":"ts-sdk","spec":"square","duration_seconds":100,"exit_code":0}' > "$PR_DIR/ts-sdk.jsonl"
+
+  mkdir -p "$MAIN_DIR/history/run1"
+  mkdir -p "$MAIN_DIR/history/run2"
+  mkdir -p "$MAIN_DIR/history/run3"
+  # 2 successful runs (exit_code 0): 80, 90 -> expected median = 85
+  echo '{"generator":"ts-sdk","spec":"square","duration_seconds":80,"exit_code":0}' > "$MAIN_DIR/history/run1/ts-sdk.jsonl"
+  echo '{"generator":"ts-sdk","spec":"square","duration_seconds":90,"exit_code":0}' > "$MAIN_DIR/history/run2/ts-sdk.jsonl"
+  # 1 failed run (exit_code 1) at 10s; it must not count toward the median or n
+  echo '{"generator":"ts-sdk","spec":"square","duration_seconds":10,"exit_code":1}' > "$MAIN_DIR/history/run3/ts-sdk.jsonl"
+
+  OUTPUT=$(BASELINE_TIMESTAMP="2026-05-08T04:00:00Z" bash "$REPORT_SCRIPT" "$PR_DIR" "$MAIN_DIR")
+
+  assert_contains "$OUTPUT" "85s (n=2)" "Generator-only median excludes failed run"
+}
+
+test_excludes_failed_gen_runs
+
 echo ""
 echo "=== PostHog JSON validation tests ==="
 echo ""
diff --git a/.github/workflows/seed.yml b/.github/workflows/seed.yml
@@ -1002,18 +1002,22 @@ jobs:
 
           BASELINE_RUNS_TO_FETCH=5
 
-          # Fetch the last N successful runs of benchmark-baseline.yml on main
-          RUNS=$(gh api "repos/${{ github.repository }}/actions/workflows/benchmark-baseline.yml/runs?status=success&per_page=${BASELINE_RUNS_TO_FETCH}&branch=main" \
-            --jq '.workflow_runs' 2>/dev/null || echo "[]")
+          # Fetch the last N completed runs of benchmark-baseline.yml on main.
+          # Use status=completed (not status=success) so we include runs where
+          # some generators failed but others produced valid artifacts. A single
+          # failing E2E job marks the whole workflow as "failure", which would
+          # hide valid baseline data from every other generator.
+          RUNS=$(gh api "repos/${{ github.repository }}/actions/workflows/benchmark-baseline.yml/runs?status=completed&per_page=20&branch=main" \
+            --jq '[.workflow_runs[] | select(.conclusion == "success" or .conclusion == "failure")] | .['":${BASELINE_RUNS_TO_FETCH}"']' 2>/dev/null || echo "[]")
 
           RUN_COUNT=$(echo "$RUNS" | jq 'length')
           if [ "$RUN_COUNT" -eq 0 ]; then
-            echo "::warning::No successful nightly baseline runs found. Report will show N/A for main timings."
+            echo "::warning::No completed nightly baseline runs found. Report will show N/A for main timings."
             echo "Trigger the benchmark-baseline workflow manually to generate initial baselines."
             exit 0
           fi
 
-          echo "Found ${RUN_COUNT} successful nightly baseline run(s)"
+          echo "Found ${RUN_COUNT} nightly baseline run(s)"
 
           # Save metadata from the latest run for the report footer
           LATEST_DATE=$(echo "$RUNS" | jq -r '.[0].created_at')
@@ -1260,17 +1264,19 @@ jobs:
 
           BASELINE_RUNS_TO_FETCH=5
 
-          # Fetch the last N successful runs of benchmark-baseline.yml on main
-          RUNS=$(gh api "repos/${{ github.repository }}/actions/workflows/benchmark-baseline.yml/runs?status=success&per_page=${BASELINE_RUNS_TO_FETCH}&branch=main" \
-            --jq '.workflow_runs' 2>/dev/null || echo "[]")
+          # Fetch the last N completed runs of benchmark-baseline.yml on main.
+          # Use status=completed (not status=success) — see SDK benchmark step
+          # comment for rationale.
+          RUNS=$(gh api "repos/${{ github.repository }}/actions/workflows/benchmark-baseline.yml/runs?status=completed&per_page=20&branch=main" \
+            --jq '[.workflow_runs[] | select(.conclusion == "success" or .conclusion == "failure")] | .['":${BASELINE_RUNS_TO_FETCH}"']' 2>/dev/null || echo "[]")
 
           RUN_COUNT=$(echo "$RUNS" | jq 'length')
           if [ "$RUN_COUNT" -eq 0 ]; then
-            echo "::warning::No successful nightly baseline runs found. Report will show N/A for main timings."
+            echo "::warning::No completed nightly baseline runs found. Report will show N/A for main timings."
             exit 0
           fi
 
-          echo "Found ${RUN_COUNT} successful nightly baseline run(s)"
+          echo "Found ${RUN_COUNT} nightly baseline run(s)"
 
           # Save metadata from the latest run for the report footer
           LATEST_DATE=$(echo "$RUNS" | jq -r '.[0].created_at')