Skip to content

Commit 9754329

Browse files
fix(ci): use completed nightly runs for benchmark baseline, not just successful (#15806)
The benchmark baseline fetch was filtering workflow runs by status=success, which only returns runs where EVERY job succeeded. When any single generator E2E job fails, the entire nightly workflow is marked as failure, hiding valid baseline data from all other generators. This caused stale baselines — e.g., Swift E2E showed 307s (from old successful runs) instead of ~700s (from recent runs that failed due to other generators).

Changes:
- Use status=completed + jq filter for success/failure conclusions, so runs where some generators failed still contribute valid data for others
- Filter out entries with non-zero exit_code in lookup functions to avoid including timings from failed generator runs
- Add tests for exit_code filtering

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: barry.zou <barry.zou@buildwithfern.com>
1 parent 740e3a5 commit 9754329

4 files changed

Lines changed: 82 additions & 15 deletions

File tree

.github/scripts/format-benchmark-report.sh

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,10 @@ lookup_e2e_baseline() {
4343
local e2e_file="${run_dir}e2e/${generator}.jsonl"
4444
[ -f "$e2e_file" ] || continue
4545
local dur
46-
dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec) | .duration_seconds' "$e2e_file" 2>/dev/null || true)
46+
# Only include runs that completed successfully (exit_code 0 or absent).
47+
# Failed runs from partially-successful nightly workflows may have
48+
# misleading timings.
49+
dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec and (.exit_code == 0 or .exit_code == null)) | .duration_seconds' "$e2e_file" 2>/dev/null || true)
4750
if [ -n "$dur" ] && [ "$dur" != "null" ] && [ "$dur" != "0" ]; then
4851
durations+=("$dur")
4952
fi
@@ -58,7 +61,7 @@ lookup_e2e_baseline() {
5861
local e2e_file="${MAIN_DIR}/e2e/${generator}.jsonl"
5962
if [ -f "$e2e_file" ]; then
6063
local dur
61-
dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec) | .duration_seconds' "$e2e_file" 2>/dev/null || true)
64+
dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec and (.exit_code == 0 or .exit_code == null)) | .duration_seconds' "$e2e_file" 2>/dev/null || true)
6265
if [ -n "$dur" ] && [ "$dur" != "null" ] && [ "$dur" != "0" ]; then
6366
E2E_BASELINE_VAL="$dur"
6467
E2E_BASELINE_RUNS=1
@@ -84,7 +87,7 @@ lookup_baseline() {
8487
local hist_file="${run_dir}/${generator}.jsonl"
8588
[ -f "$hist_file" ] || continue
8689
local dur
87-
dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec) | .duration_seconds' "$hist_file" 2>/dev/null || true)
90+
dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec and (.exit_code == 0 or .exit_code == null)) | .duration_seconds' "$hist_file" 2>/dev/null || true)
8891
if [ -n "$dur" ] && [ "$dur" != "null" ] && [ "$dur" != "0" ]; then
8992
durations+=("$dur")
9093
fi

.github/scripts/format-docs-benchmark-report.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ lookup_docs_baseline() {
4242
for f in "${run_dir}"*.jsonl; do
4343
[ -f "$f" ] || continue
4444
local dur
45-
dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec) | .duration_seconds' "$f" 2>/dev/null || true)
45+
dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec and (.exit_code == 0 or .exit_code == null)) | .duration_seconds' "$f" 2>/dev/null || true)
4646
if [ -n "$dur" ] && [ "$dur" != "null" ] && [ "$dur" != "0" ]; then
4747
durations+=("$dur")
4848
fi
@@ -57,7 +57,7 @@ lookup_docs_baseline() {
5757
for f in "${BASELINE_DIR}"/*.jsonl; do
5858
[ -f "$f" ] || continue
5959
local dur
60-
dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec) | .duration_seconds' "$f" 2>/dev/null || true)
60+
dur=$(jq -r --arg spec "$spec" 'select(.spec == $spec and (.exit_code == 0 or .exit_code == null)) | .duration_seconds' "$f" 2>/dev/null || true)
6161
if [ -n "$dur" ] && [ "$dur" != "null" ] && [ "$dur" != "0" ]; then
6262
BASELINE_VAL="$dur"
6363
BASELINE_RUNS=1

.github/scripts/test-format-benchmark-report.sh

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,64 @@ test_last_updated_timestamp() {
588588
}
589589

590590
test_last_updated_timestamp
591+
592+
# Test 25: Failed baseline runs (non-zero exit_code) are excluded from median
593+
test_excludes_failed_runs() {
594+
echo "Test: History runs with non-zero exit_code are excluded from baseline"
595+
setup_dirs
596+
echo '{"generator":"swift-sdk","spec":"square","duration_seconds":50,"exit_code":0}' > "$PR_DIR/swift-sdk.jsonl"
597+
598+
# Create 5 history entries. 3 succeeded at ~700s, 2 failed mid-run at ~150s.
599+
# Only the 3 successful runs should contribute to the median (700).
600+
mkdir -p "$MAIN_DIR/history/run1/e2e"
601+
mkdir -p "$MAIN_DIR/history/run2/e2e"
602+
mkdir -p "$MAIN_DIR/history/run3/e2e"
603+
mkdir -p "$MAIN_DIR/history/run4/e2e"
604+
mkdir -p "$MAIN_DIR/history/run5/e2e"
605+
# Generator-only baselines (all succeed)
606+
for r in run1 run2 run3 run4 run5; do
607+
echo '{"generator":"swift-sdk","spec":"square","duration_seconds":42,"exit_code":0}' > "$MAIN_DIR/history/$r/swift-sdk.jsonl"
608+
done
609+
# E2E: 3 successful runs at 690, 700, 710
610+
echo '{"generator":"swift-sdk","spec":"square","duration_seconds":690,"exit_code":0}' > "$MAIN_DIR/history/run1/e2e/swift-sdk.jsonl"
611+
echo '{"generator":"swift-sdk","spec":"square","duration_seconds":700,"exit_code":0}' > "$MAIN_DIR/history/run2/e2e/swift-sdk.jsonl"
612+
echo '{"generator":"swift-sdk","spec":"square","duration_seconds":710,"exit_code":0}' > "$MAIN_DIR/history/run3/e2e/swift-sdk.jsonl"
613+
# E2E: 2 failed runs with misleading short durations
614+
echo '{"generator":"swift-sdk","spec":"square","duration_seconds":150,"exit_code":1}' > "$MAIN_DIR/history/run4/e2e/swift-sdk.jsonl"
615+
echo '{"generator":"swift-sdk","spec":"square","duration_seconds":160,"exit_code":1}' > "$MAIN_DIR/history/run5/e2e/swift-sdk.jsonl"
616+
617+
OUTPUT=$(BASELINE_TIMESTAMP="2026-05-08T04:00:00Z" bash "$REPORT_SCRIPT" "$PR_DIR" "$MAIN_DIR")
618+
619+
# Median of [690, 700, 710] = 700, not polluted by failed runs.
620+
# Generator-only column has n=5 (all succeeded), E2E column has n=3 (2 failed excluded).
621+
assert_contains "$OUTPUT" "700s (n=3)" "E2E median excludes failed runs (n=3 not n=5)"
622+
assert_contains "$OUTPUT" "42s (n=5)" "Generator-only baseline still includes all 5 runs"
623+
}
624+
625+
test_excludes_failed_runs
626+
627+
# Test 26: Generator-only baseline also excludes failed runs
628+
test_excludes_failed_gen_runs() {
629+
echo "Test: Generator-only baseline excludes runs with non-zero exit_code"
630+
setup_dirs
631+
echo '{"generator":"ts-sdk","spec":"square","duration_seconds":100,"exit_code":0}' > "$PR_DIR/ts-sdk.jsonl"
632+
633+
mkdir -p "$MAIN_DIR/history/run1"
634+
mkdir -p "$MAIN_DIR/history/run2"
635+
mkdir -p "$MAIN_DIR/history/run3"
636+
# 2 successful runs: 80, 90 -> median = 85
637+
echo '{"generator":"ts-sdk","spec":"square","duration_seconds":80,"exit_code":0}' > "$MAIN_DIR/history/run1/ts-sdk.jsonl"
638+
echo '{"generator":"ts-sdk","spec":"square","duration_seconds":90,"exit_code":0}' > "$MAIN_DIR/history/run2/ts-sdk.jsonl"
639+
# 1 failed run with short duration
640+
echo '{"generator":"ts-sdk","spec":"square","duration_seconds":10,"exit_code":1}' > "$MAIN_DIR/history/run3/ts-sdk.jsonl"
641+
642+
OUTPUT=$(BASELINE_TIMESTAMP="2026-05-08T04:00:00Z" bash "$REPORT_SCRIPT" "$PR_DIR" "$MAIN_DIR")
643+
644+
assert_contains "$OUTPUT" "85s (n=2)" "Generator-only median excludes failed run"
645+
}
646+
647+
test_excludes_failed_gen_runs
648+
591649
echo ""
592650
echo "=== PostHog JSON validation tests ==="
593651
echo ""

.github/workflows/seed.yml

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1002,18 +1002,22 @@ jobs:
10021002
10031003
BASELINE_RUNS_TO_FETCH=5
10041004
1005-
# Fetch the last N successful runs of benchmark-baseline.yml on main
1006-
RUNS=$(gh api "repos/${{ github.repository }}/actions/workflows/benchmark-baseline.yml/runs?status=success&per_page=${BASELINE_RUNS_TO_FETCH}&branch=main" \
1007-
--jq '.workflow_runs' 2>/dev/null || echo "[]")
1005+
# Fetch the last N completed runs of benchmark-baseline.yml on main.
1006+
# Use status=completed (not status=success) so we include runs where
1007+
# some generators failed but others produced valid artifacts. A single
1008+
# failing E2E job marks the whole workflow as "failure", which would
1009+
# hide valid baseline data from every other generator.
1010+
RUNS=$(gh api "repos/${{ github.repository }}/actions/workflows/benchmark-baseline.yml/runs?status=completed&per_page=20&branch=main" \
1011+
--jq '[.workflow_runs[] | select(.conclusion == "success" or .conclusion == "failure")] | .['":${BASELINE_RUNS_TO_FETCH}"']' 2>/dev/null || echo "[]")
10081012
10091013
RUN_COUNT=$(echo "$RUNS" | jq 'length')
10101014
if [ "$RUN_COUNT" -eq 0 ]; then
1011-
echo "::warning::No successful nightly baseline runs found. Report will show N/A for main timings."
1015+
echo "::warning::No completed nightly baseline runs found. Report will show N/A for main timings."
10121016
echo "Trigger the benchmark-baseline workflow manually to generate initial baselines."
10131017
exit 0
10141018
fi
10151019
1016-
echo "Found ${RUN_COUNT} successful nightly baseline run(s)"
1020+
echo "Found ${RUN_COUNT} nightly baseline run(s)"
10171021
10181022
# Save metadata from the latest run for the report footer
10191023
LATEST_DATE=$(echo "$RUNS" | jq -r '.[0].created_at')
@@ -1260,17 +1264,19 @@ jobs:
12601264
12611265
BASELINE_RUNS_TO_FETCH=5
12621266
1263-
# Fetch the last N successful runs of benchmark-baseline.yml on main
1264-
RUNS=$(gh api "repos/${{ github.repository }}/actions/workflows/benchmark-baseline.yml/runs?status=success&per_page=${BASELINE_RUNS_TO_FETCH}&branch=main" \
1265-
--jq '.workflow_runs' 2>/dev/null || echo "[]")
1267+
# Fetch the last N completed runs of benchmark-baseline.yml on main.
1268+
# Use status=completed (not status=success) — see SDK benchmark step
1269+
# comment for rationale.
1270+
RUNS=$(gh api "repos/${{ github.repository }}/actions/workflows/benchmark-baseline.yml/runs?status=completed&per_page=20&branch=main" \
1271+
--jq '[.workflow_runs[] | select(.conclusion == "success" or .conclusion == "failure")] | .['":${BASELINE_RUNS_TO_FETCH}"']' 2>/dev/null || echo "[]")
12661272
12671273
RUN_COUNT=$(echo "$RUNS" | jq 'length')
12681274
if [ "$RUN_COUNT" -eq 0 ]; then
1269-
echo "::warning::No successful nightly baseline runs found. Report will show N/A for main timings."
1275+
echo "::warning::No completed nightly baseline runs found. Report will show N/A for main timings."
12701276
exit 0
12711277
fi
12721278
1273-
echo "Found ${RUN_COUNT} successful nightly baseline run(s)"
1279+
echo "Found ${RUN_COUNT} nightly baseline run(s)"
12741280
12751281
# Save metadata from the latest run for the report footer
12761282
LATEST_DATE=$(echo "$RUNS" | jq -r '.[0].created_at')

0 commit comments

Comments
 (0)