Skip to content

Commit f742c1f

Browse files
authored
Compress data (#129)
1 parent 4442123 commit f742c1f

3 files changed

Lines changed: 253 additions & 67 deletions

File tree

.github/workflows/process_results.yaml

Lines changed: 69 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,18 @@ jobs:
4040
ref: "${{ env.BRANCH_NAME }}"
4141
path: asv-runner/
4242

43+
# Idempotent migration from the legacy loose-files layout to compressed
44+
# tarballs. Archiving is skipped once the tarballs exist; loose dirs are always removed.
45+
- name: Migrate legacy storage layout
46+
run: |
47+
if [ -d asv-runner/data/results ] && [ ! -f asv-runner/data/results.tar.zst ]; then
48+
tar -C asv-runner/data -I 'zstd -19' -cf asv-runner/data/results.tar.zst results
49+
fi
50+
if [ -d asv-runner/data/envs ] && [ ! -f asv-runner/data/envs.tar.zst ]; then
51+
tar -C asv-runner/data -I 'zstd -19' -cf asv-runner/data/envs.tar.zst envs
52+
fi
53+
rm -rf asv-runner/data/results asv-runner/data/envs
54+
4355
- name: Setup Python
4456
uses: actions/setup-python@v6
4557
with:
@@ -54,8 +66,14 @@ jobs:
5466
- name: Show environment packages
5567
run: uv pip freeze
5668

57-
- name: Copy results directory
58-
run: cp -r asv-runner/data/results asv_bench/results
69+
# Extract results.tar.zst into asv_bench/ so `asv publish` and
70+
# process_results.py see the same paths as before. Extract envs.tar.zst
71+
# back under asv-runner/data/envs/ so make_issues.py can read per-SHA
72+
# env files. The extracted dirs are gitignored.
73+
- name: Extract result and env archives
74+
run: |
75+
tar -I zstd -xf asv-runner/data/results.tar.zst -C asv_bench/
76+
tar -I zstd -xf asv-runner/data/envs.tar.zst -C asv-runner/data/
5977
6078
- name: Publish ASV Benchmarks
6179
run: cd asv_bench && asv publish
@@ -71,14 +89,53 @@ jobs:
7189
env:
7290
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
7391

74-
- name: Update asv-runner branch
75-
# In case there was a push by another job.
76-
run: cd asv-runner && git fetch && git pull
92+
# Save our parquet+docs, refetch the latest tarballs (a concurrent
93+
# run_asvs merge may have updated them while asv publish was running),
94+
# restore our parquet+docs over the fresh tarballs, then orphan
95+
# force-push. Retry on lease failure.
96+
- name: Push results
97+
run: |
98+
SAVE=$(mktemp -d)
99+
cp -r asv-runner/data/results.parquet "$SAVE/"
100+
cp -r asv-runner/docs "$SAVE/"
77101
78-
- name: Commit results to branch
79-
uses: stefanzweifel/git-auto-commit-action@v5
80-
with:
81-
commit_message: Results
82-
branch: ${{ env.BRANCH_NAME }}
83-
repository: asv-runner
84-
file_pattern: 'data/results.parquet docs/'
102+
for attempt in $(seq 1 5); do
103+
cd asv-runner
104+
git fetch origin ${BRANCH_NAME}
105+
git checkout -B ${BRANCH_NAME} origin/${BRANCH_NAME}
106+
git config user.name "github-actions[bot]"
107+
git config user.email "github-actions[bot]@users.noreply.github.com"
108+
109+
# Idempotent migration from legacy loose-files layout.
110+
if [ -d data/results ] && [ ! -f data/results.tar.zst ]; then
111+
tar -C data -I 'zstd -19' -cf data/results.tar.zst results
112+
fi
113+
if [ -d data/envs ] && [ ! -f data/envs.tar.zst ]; then
114+
tar -C data -I 'zstd -19' -cf data/envs.tar.zst envs
115+
fi
116+
rm -rf data/results data/envs
117+
118+
# Restore our parquet+docs into the fresh checkout, replacing the fetched copies.
119+
rm -rf data/results.parquet docs
120+
cp -r "$SAVE/results.parquet" data/results.parquet
121+
cp -r "$SAVE/docs" docs
122+
123+
EXPECTED=$(git rev-parse origin/${BRANCH_NAME})
124+
git checkout --orphan fresh
125+
git add -A
126+
git commit -m "Results"
127+
git branch -M fresh ${BRANCH_NAME}
128+
129+
if git push --force-with-lease=${BRANCH_NAME}:${EXPECTED} origin ${BRANCH_NAME}; then
130+
echo "Push succeeded on attempt $attempt"
131+
cd ..
132+
exit 0
133+
fi
134+
135+
cd ..
136+
echo "Push race lost on attempt $attempt; retrying"
137+
sleep $((attempt * 5 + RANDOM % 10))
138+
done
139+
140+
echo "Failed to push after 5 attempts" >&2
141+
exit 1

.github/workflows/run_asvs_2026_01_04.yaml

Lines changed: 182 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,20 @@ permissions:
1919
contents: read
2020

2121
jobs:
22-
asv:
23-
name: Run ASVs
22+
# Pick the next un-benchmarked SHA and reserve it via shas.txt.
23+
# Fast (~seconds) so concurrent claims rarely collide; the retry loop
24+
# below covers the case when they do.
25+
claim:
26+
name: Claim SHA
2427
runs-on: ubuntu-24.04
2528
defaults:
2629
run:
2730
shell: bash -el {0}
2831
permissions:
2932
contents: write
30-
issues: write
3133
outputs:
32-
new_commit: ${{ steps.get-commit.outputs.new_commit }}
34+
sha: ${{ steps.claim.outputs.sha }}
35+
new_commit: ${{ steps.claim.outputs.new_commit }}
3336
steps:
3437
# In order to run pandas' actions, we have to checkout into the root directory.
3538
- name: Checkout pandas
@@ -44,74 +47,198 @@ jobs:
4447
ref: ${{ env.BRANCH_NAME }}
4548
path: asv-runner/
4649

47-
- name: Set up pixi
48-
uses: ./.github/actions/setup-pixi
49-
with:
50-
environment: "asv"
51-
52-
- name: Get Commit
53-
id: get-commit
50+
- name: Claim next SHA
51+
id: claim
5452
run: |
55-
sha="$(python asv-runner/ci/find_commit_to_run.py --input-path=asv-runner/data/ --repo-path=.)"
56-
echo "sha: $sha"
57-
if [ "$sha" = "NONE" ]; then
58-
echo "new_commit=no"
59-
echo "new_commit=no" >> "$GITHUB_OUTPUT"
60-
else
61-
echo "new_commit=yes"
62-
echo "new_commit=yes" >> "$GITHUB_OUTPUT"
53+
claimed=false
54+
new_commit=no
55+
sha=NONE
56+
for attempt in $(seq 1 5); do
57+
cd asv-runner
58+
git fetch origin ${BRANCH_NAME}
59+
git checkout -B ${BRANCH_NAME} origin/${BRANCH_NAME}
60+
git config user.name "github-actions[bot]"
61+
git config user.email "github-actions[bot]@users.noreply.github.com"
62+
63+
# Idempotent migration from the legacy loose-files layout; re-run on every attempt.
64+
if [ -d data/results ] && [ ! -f data/results.tar.zst ]; then
65+
tar -C data -I 'zstd -19' -cf data/results.tar.zst results
66+
fi
67+
if [ -d data/envs ] && [ ! -f data/envs.tar.zst ]; then
68+
tar -C data -I 'zstd -19' -cf data/envs.tar.zst envs
69+
fi
70+
rm -rf data/results data/envs
71+
72+
cd ..
73+
sha="$(python3 asv-runner/ci/find_commit_to_run.py --input-path=asv-runner/data/ --repo-path=.)"
74+
if [ "$sha" = "NONE" ]; then
75+
new_commit=no
76+
claimed=true
77+
break
78+
fi
79+
6380
echo "$sha" >> asv-runner/data/shas.txt
64-
git checkout $sha
81+
82+
cd asv-runner
83+
EXPECTED=$(git rev-parse origin/${BRANCH_NAME})
84+
git checkout --orphan fresh
85+
git add -A
86+
git commit -m "Update shas.txt"
87+
git branch -M fresh ${BRANCH_NAME}
88+
89+
if git push --force-with-lease=${BRANCH_NAME}:${EXPECTED} origin ${BRANCH_NAME}; then
90+
new_commit=yes
91+
claimed=true
92+
cd ..
93+
break
94+
fi
95+
96+
cd ..
97+
echo "Claim race lost on attempt $attempt; retrying"
98+
sleep $((attempt * 5 + RANDOM % 10))
99+
done
100+
101+
if [ "$claimed" = "false" ]; then
102+
echo "Failed to claim a commit after retries" >&2
103+
exit 1
65104
fi
66105
67-
- name: Update asv-runner branch before committing shas.txt
68-
# In case there was a push by another job.
69-
if: ${{ steps.get-commit.outputs.new_commit == 'yes' }}
70-
run: cd asv-runner && git fetch && git pull
106+
echo "new_commit=$new_commit" >> "$GITHUB_OUTPUT"
107+
echo "sha=$sha" >> "$GITHUB_OUTPUT"
108+
109+
# Build pandas at the claimed SHA and run the ASV suite. No concurrency
110+
# lock here — multiple workflow runs benchmark different SHAs in parallel.
111+
# Outputs are uploaded as a workflow artifact for the merge job.
112+
benchmark:
113+
name: Run benchmarks
114+
needs: claim
115+
if: needs.claim.outputs.new_commit == 'yes'
116+
runs-on: ubuntu-24.04
117+
defaults:
118+
run:
119+
shell: bash -el {0}
120+
env:
121+
SHA: ${{ needs.claim.outputs.sha }}
122+
steps:
123+
- name: Checkout pandas
124+
uses: actions/checkout@v6
125+
with:
126+
repository: pandas-dev/pandas
127+
fetch-depth: 0
71128

72-
# Prevent another job from kicking off and running on this commit
73-
- name: Commit shas.txt
74-
if: ${{ steps.get-commit.outputs.new_commit == 'yes' }}
75-
uses: stefanzweifel/git-auto-commit-action@v5
129+
- name: Set up pixi
130+
uses: ./.github/actions/setup-pixi
76131
with:
77-
commit_message: Update shas.txt
78-
branch: ${{ env.BRANCH_NAME }}
79-
repository: asv-runner
80-
file_pattern: 'data/shas.txt'
132+
environment: "asv"
133+
134+
- name: Checkout target SHA
135+
run: git checkout $SHA
81136

82137
- name: Build pandas
83-
if: ${{ steps.get-commit.outputs.new_commit == 'yes' }}
84138
run: pixi run --environment asv build-pandas --editable -Csetup-args="--werror"
85139

86140
- name: Run ASV Benchmarks
87-
if: ${{ steps.get-commit.outputs.new_commit == 'yes' }}
88141
run: |
89142
eval "$(pixi shell-hook -e asv)"
90143
cd asv_bench
91144
python -m asv machine --machine=asvrunner --yes
92-
python -m asv run --machine=asvrunner --python=same --set-commit-hash=$(git rev-parse HEAD) --show-stderr
93-
94-
- name: Update asv-runner branch
95-
# In case there was a push by another job.
96-
if: ${{ steps.get-commit.outputs.new_commit == 'yes' }}
97-
run: cd asv-runner && git fetch && git pull
145+
python -m asv run --machine=asvrunner --python=same --set-commit-hash=$SHA --show-stderr
98146
99-
- name: Move results into asv-runner
100-
if: ${{ steps.get-commit.outputs.new_commit == 'yes' }}
147+
- name: Stage outputs for upload
101148
run: |
102-
mkdir -p asv-runner/data/results/asvrunner/
103-
cp asv_bench/results/benchmarks.json asv-runner/data/results/
104-
cp asv_bench/results/asvrunner/machine.json asv-runner/data/results/asvrunner/
105-
cp asv_bench/results/asvrunner/$(git rev-parse --short=8 HEAD)-existing*.json asv-runner/data/results/asvrunner/$(git rev-parse HEAD).json
149+
SHORT_SHA=$(git rev-parse --short=8 $SHA)
150+
mkdir -p /tmp/upload/results/asvrunner /tmp/upload/envs
151+
cp asv_bench/results/benchmarks.json /tmp/upload/results/
152+
cp asv_bench/results/asvrunner/machine.json /tmp/upload/results/asvrunner/
153+
cp asv_bench/results/asvrunner/${SHORT_SHA}-existing*.json /tmp/upload/results/asvrunner/${SHA}.json
154+
pixi list -e asv --fields name,version > /tmp/upload/envs/${SHA}.yml
155+
156+
- name: Upload artifact
157+
uses: actions/upload-artifact@v4
158+
with:
159+
name: asv-results-${{ needs.claim.outputs.sha }}
160+
path: /tmp/upload/
106161

107-
mkdir -p asv-runner/data/envs/
108-
pixi list -e asv --fields name,version > asv-runner/data/envs/$(git rev-parse HEAD).yml
162+
# Layer this run's results into the latest tarballs and orphan force-push.
163+
# If a concurrent merge wins the race (lease fails), refetch — that
164+
# winner's data is now in the tarballs we extract — re-add ours on top
165+
# and try again.
166+
merge:
167+
name: Merge into storage branch
168+
needs: [claim, benchmark]
169+
if: needs.claim.outputs.new_commit == 'yes'
170+
runs-on: ubuntu-24.04
171+
defaults:
172+
run:
173+
shell: bash -el {0}
174+
permissions:
175+
contents: write
176+
env:
177+
SHA: ${{ needs.claim.outputs.sha }}
178+
steps:
179+
- name: Checkout asv-runner results branch
180+
uses: actions/checkout@v6
181+
with:
182+
ref: ${{ env.BRANCH_NAME }}
183+
path: asv-runner/
109184

110-
- name: Commit results to branch
111-
if: ${{ steps.get-commit.outputs.new_commit == 'yes' }}
112-
uses: stefanzweifel/git-auto-commit-action@v5
185+
- name: Download benchmark output
186+
uses: actions/download-artifact@v4
113187
with:
114-
commit_message: Results
115-
branch: ${{ env.BRANCH_NAME }}
116-
repository: asv-runner
117-
file_pattern: 'data/results/ data/envs/'
188+
name: asv-results-${{ needs.claim.outputs.sha }}
189+
path: /tmp/new
190+
191+
- name: Merge results and push
192+
run: |
193+
for attempt in $(seq 1 5); do
194+
cd asv-runner
195+
git fetch origin ${BRANCH_NAME}
196+
git checkout -B ${BRANCH_NAME} origin/${BRANCH_NAME}
197+
git config user.name "github-actions[bot]"
198+
git config user.email "github-actions[bot]@users.noreply.github.com"
199+
200+
# Idempotent migration from legacy loose-files layout.
201+
if [ -d data/results ] && [ ! -f data/results.tar.zst ]; then
202+
tar -C data -I 'zstd -19' -cf data/results.tar.zst results
203+
fi
204+
if [ -d data/envs ] && [ ! -f data/envs.tar.zst ]; then
205+
tar -C data -I 'zstd -19' -cf data/envs.tar.zst envs
206+
fi
207+
rm -rf data/results data/envs
208+
209+
WORK=$(mktemp -d)
210+
mkdir -p "$WORK/results/asvrunner" "$WORK/envs"
211+
if [ -f data/results.tar.zst ]; then
212+
tar -I zstd -xf data/results.tar.zst -C "$WORK"
213+
fi
214+
if [ -f data/envs.tar.zst ]; then
215+
tar -I zstd -xf data/envs.tar.zst -C "$WORK"
216+
fi
217+
218+
cp /tmp/new/results/benchmarks.json "$WORK/results/"
219+
cp /tmp/new/results/asvrunner/machine.json "$WORK/results/asvrunner/"
220+
cp /tmp/new/results/asvrunner/${SHA}.json "$WORK/results/asvrunner/"
221+
cp /tmp/new/envs/${SHA}.yml "$WORK/envs/"
222+
223+
tar -C "$WORK" -I 'zstd -19' -cf data/results.tar.zst results
224+
tar -C "$WORK" -I 'zstd -19' -cf data/envs.tar.zst envs
225+
226+
EXPECTED=$(git rev-parse origin/${BRANCH_NAME})
227+
git checkout --orphan fresh
228+
git add -A
229+
git commit -m "Results"
230+
git branch -M fresh ${BRANCH_NAME}
231+
232+
if git push --force-with-lease=${BRANCH_NAME}:${EXPECTED} origin ${BRANCH_NAME}; then
233+
echo "Push succeeded on attempt $attempt"
234+
cd ..
235+
exit 0
236+
fi
237+
238+
cd ..
239+
echo "Merge race lost on attempt $attempt; retrying"
240+
sleep $((attempt * 5 + RANDOM % 10))
241+
done
242+
243+
echo "Failed to merge after 5 attempts" >&2
244+
exit 1

data/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
results/
2+
envs/

0 commit comments

Comments
 (0)