Skip to content

Commit 0a24d75

Browse files
committed
ci: shard alpine-test into parallel jobs to reduce CI time
The alpine-test CI job runs all ~483 zdtm tests sequentially three times (normal, mntns-compat-mode, criu-config), followed by many non-shardable tests. This dominates overall CI wait time. With only 2 jobs running in parallel (GCC and CLANG) the alpine tests take around 30 minutes.

Use the existing --test-shard-index and --test-shard-count flags already built into test/zdtm.py to split the zdtm test suite across four parallel runners (shards 0-3). A fifth shard runs all non-shardable tests (lazy pages, fault injection, test/others/*, rootless, compel, plugins, etc.) independently and in parallel with the zdtm shards.

This increases parallelism from 2 to 10 jobs and reduces the alpine test wall-clock time from ~30 to ~10 minutes.

Changes:

- run-ci-tests.sh: Build SHARD_OPTS from ZDTM_SHARD_INDEX/COUNT env vars and pass them to zdtm.py. Extract all non-shardable tests into a run_non_shardable_tests() function. Dispatch based on shard index: 0-3 run zdtm slices, 4 runs non-shardable tests, unset runs everything sequentially (preserving existing behavior). Validate that ZDTM_SHARD_INDEX is set when ZDTM_SHARD_COUNT is set.

- Makefile: Pass ZDTM_SHARD_INDEX and ZDTM_SHARD_COUNT into the container when set. Split long container run command across multiple lines for readability.

- ci.yml: Add shard: [0, 1, 2, 3, 4] to the alpine-test matrix, producing 10 jobs (2 compilers x 5 shards). Job labels now show descriptive shard names (e.g. "zdtm 1/4", "non-zdtm") instead of raw indices.

When sharding is not configured the script behaves identically to before, so other CI jobs (aarch64, compat, gcov, etc.) are unaffected.

Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Adrian Reber <areber@redhat.com>
1 parent 7539f39 commit 0a24d75

3 files changed

Lines changed: 200 additions & 132 deletions

File tree

.github/workflows/ci.yml

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,31 @@ concurrency:
99

1010
jobs:
1111
alpine-test:
12-
name: Alpine Test
12+
name: Alpine Test (${{ matrix.target }}, ${{ matrix.shard_name }})
1313
strategy:
1414
matrix:
1515
os: [ubuntu-22.04]
1616
target: [GCC=1, CLANG=1]
17+
shard: [0, 1, 2, 3, 4]
18+
include:
19+
- shard: 0
20+
shard_name: zdtm 1/4
21+
- shard: 1
22+
shard_name: zdtm 2/4
23+
- shard: 2
24+
shard_name: zdtm 3/4
25+
- shard: 3
26+
shard_name: zdtm 4/4
27+
- shard: 4
28+
shard_name: non-zdtm
1729
runs-on: ${{ matrix.os }}
1830
steps:
1931
- uses: actions/checkout@v4
20-
- name: Run Alpine ${{ matrix.target }} Test
21-
run: sudo -E make -C scripts/ci alpine ${{ matrix.target }}
32+
- name: Run Alpine ${{ matrix.target }} ${{ matrix.shard_name }} Test
33+
run: >
34+
sudo -E make -C scripts/ci alpine ${{ matrix.target }}
35+
ZDTM_SHARD_INDEX=${{ matrix.shard }}
36+
ZDTM_SHARD_COUNT=4
2237
2338
alpine-test-arm64:
2439
name: Alpine Test ARM64

scripts/ci/Makefile

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,19 @@ ifeq ($(CONTAINER_RUNTIME),podman)
4545
endif
4646

4747
export ZDTM_OPTS
48+
export ZDTM_SHARD_INDEX
49+
export ZDTM_SHARD_COUNT
4850

4951
$(TARGETS):
5052
$(MAKE) -C ../build $@$(target-suffix)
51-
$(CONTAINER_RUNTIME) run --env-file docker.env -v `pwd`/../../:/criu $(if $(ZDTM_OPTS),-e ZDTM_OPTS) $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh
53+
$(CONTAINER_RUNTIME) run \
54+
--env-file docker.env \
55+
-v `pwd`/../../:/criu \
56+
$(if $(ZDTM_OPTS),-e ZDTM_OPTS) \
57+
$(if $(ZDTM_SHARD_INDEX),-e ZDTM_SHARD_INDEX) \
58+
$(if $(ZDTM_SHARD_COUNT),-e ZDTM_SHARD_COUNT) \
59+
$(CONTAINER_OPTS) \
60+
criu-$@ scripts/ci/run-ci-tests.sh
5261

5362
fedora-asan:
5463
$(MAKE) -C ../build $@$(target-suffix)

scripts/ci/run-ci-tests.sh

Lines changed: 172 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,16 @@ X86_64_PKGS=(gcc-multilib)
88
# Convert from string to array.
99
IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS"
1010

11+
SHARD_OPTS=()
12+
if [ -n "$ZDTM_SHARD_COUNT" ] && [ "$ZDTM_SHARD_COUNT" -gt 0 ]; then
13+
if [ -z "$ZDTM_SHARD_INDEX" ]; then
14+
echo "ERROR: ZDTM_SHARD_COUNT set but ZDTM_SHARD_INDEX is not"
15+
exit 1
16+
fi
17+
SHARD_OPTS=(--test-shard-index "$ZDTM_SHARD_INDEX" \
18+
--test-shard-count "$ZDTM_SHARD_COUNT")
19+
fi
20+
1121
UNAME_M=$(uname -m)
1222

1323
if [ "$UNAME_M" != "x86_64" ]; then
@@ -220,155 +230,189 @@ if [ "${STREAM_TEST}" = "1" ]; then
220230
exit 0
221231
fi
222232

223-
./test/zdtm.py run -a -p 2 --keep-going "${ZDTM_OPTS[@]}"
224-
if criu/criu check --feature move_mount_set_group; then
225-
./test/zdtm.py run -a -p 2 --mntns-compat-mode --keep-going "${ZDTM_OPTS[@]}"
226-
fi
233+
run_non_shardable_tests() {
234+
# Newer kernels are blocking access to userfaultfd:
235+
# uffd: Set unprivileged_userfaultfd sysctl knob to 1 if kernel faults
236+
# must be handled without obtaining CAP_SYS_PTRACE capability
237+
if [ -e /proc/sys/vm/unprivileged_userfaultfd ]; then
238+
echo 1 > /proc/sys/vm/unprivileged_userfaultfd
239+
fi
227240

228-
./test/zdtm.py run -a -p 2 --keep-going --criu-config "${ZDTM_OPTS[@]}"
241+
LAZY_EXCLUDE=(-x maps04 -x cmdlinenv00 -x maps007)
229242

230-
# Newer kernels are blocking access to userfaultfd:
231-
# uffd: Set unprivileged_userfaultfd sysctl knob to 1 if kernel faults must be handled without obtaining CAP_SYS_PTRACE capability
232-
if [ -e /proc/sys/vm/unprivileged_userfaultfd ]; then
233-
echo 1 > /proc/sys/vm/unprivileged_userfaultfd
234-
fi
243+
LAZY_TESTS='.*(maps0|uffd-events|lazy-thp|futex|fork).*'
244+
LAZY_OPTS=(-p 2 -T "$LAZY_TESTS" "${LAZY_EXCLUDE[@]}" "${ZDTM_OPTS[@]}")
235245

236-
LAZY_EXCLUDE=(-x maps04 -x cmdlinenv00 -x maps007)
246+
./test/zdtm.py run "${LAZY_OPTS[@]}" --lazy-pages
247+
./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages
248+
./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages --tls
237249

238-
LAZY_TESTS='.*(maps0|uffd-events|lazy-thp|futex|fork).*'
239-
LAZY_OPTS=(-p 2 -T "$LAZY_TESTS" "${LAZY_EXCLUDE[@]}" "${ZDTM_OPTS[@]}")
250+
bash -x ./test/jenkins/criu-fault.sh
251+
if [ "$UNAME_M" == "x86_64" ]; then
252+
# This fails on aarch64 (aws-graviton2) with:
253+
# 33: ERR: thread-bomb.c:49: pthread_attr_setstacksize(): 22
254+
bash -x ./test/jenkins/criu-fcg.sh
255+
fi
256+
bash -x ./test/jenkins/criu-inhfd.sh
240257

241-
./test/zdtm.py run "${LAZY_OPTS[@]}" --lazy-pages
242-
./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages
243-
./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages --tls
258+
if [ -z "$SKIP_EXT_DEV_TEST" ]; then
259+
make -C test/others/mnt-ext-dev/ run
260+
if criu/criu check --feature move_mount_set_group; then
261+
EXTRA_OPTS=--mntns-compat-mode make -C test/others/mnt-ext-dev/ run
262+
fi
263+
fi
244264

245-
bash -x ./test/jenkins/criu-fault.sh
246-
if [ "$UNAME_M" == "x86_64" ]; then
247-
# This fails on aarch64 (aws-graviton2) with:
248-
# 33: ERR: thread-bomb.c:49: pthread_attr_setstacksize(): 22
249-
bash -x ./test/jenkins/criu-fcg.sh
250-
fi
251-
bash -x ./test/jenkins/criu-inhfd.sh
265+
make -C test/others/make/ run CC="$CC"
266+
if [ -n "$CIRCLECI" ]; then
267+
# GitHub Actions (and Cirrus CI) does not provide a real TTY
268+
# and CRIU will fail with:
269+
# Error (criu/tty.c:1014): tty: Don't have tty to inherit
270+
# session from, aborting
271+
make -C test/others/shell-job/ run
272+
fi
273+
make -C test/others/criu-ns/ run
274+
make -C test/others/skip-file-rwx-check/ run
275+
make -C test/others/rpc/ run
276+
277+
./test/zdtm.py run -t zdtm/static/env00 --sibling
278+
279+
./test/zdtm.py run -t zdtm/static/maps00 --preload-libfault
280+
./test/zdtm.py run -t zdtm/static/maps02 --preload-libfault
281+
282+
./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --dedup
283+
./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --noauto-dedup
284+
./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server
285+
./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server --dedup
286+
./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --pre-dump-mode read
287+
288+
./test/zdtm.py run -t zdtm/transition/pid_reuse --pre 2 # start time based pid reuse detection
289+
./test/zdtm.py run -t zdtm/transition/pidfd_store_sk --rpc --pre 2 # pidfd based pid reuse detection
290+
291+
./test/zdtm.py run -t zdtm/static/socket-tcp-local --norst
292+
293+
ip net add test
294+
./test/zdtm.py run -t zdtm/static/env00 -f h --join-ns
295+
296+
# RPC testing
297+
./test/zdtm.py run -t zdtm/static/env00 --rpc # Basic
298+
./test/zdtm.py run -t zdtm/static/env00 --rpc --pre 2 --page-server
299+
./test/zdtm.py run -t zdtm/static/ptrace_sig -f h --rpc # Error handling (crfail test)
300+
301+
./test/zdtm.py run --empty-ns -T zdtm/static/socket-tcp*-local --iter 2
302+
303+
./test/zdtm.py run -t zdtm/static/env00 -t zdtm/transition/fork -t zdtm/static/ghost_holes00 -t zdtm/static/socket-tcp -t zdtm/static/msgque -k always
304+
./test/crit-recode.py
305+
306+
# Rootless tests
307+
# Check if cap_checkpoint_restore is supported and also if unshare -c
308+
# is supported.
309+
#
310+
# Do not run this test in a container
311+
# (see https://github.com/checkpoint-restore/criu/issues/2312).
312+
# Before v6.8-rc1~215^2~6, the kernel currently did not show correct
313+
# device and inode numbers in /proc/pid/maps for stackable file
314+
# systems.
315+
skip=0
316+
findmnt -no FSTYPE / | grep overlay && {
317+
./criu/criu check --feature overlayfs_maps || skip=1
318+
}
319+
unshare -c /bin/true || skip=1
320+
capsh --supports=cap_checkpoint_restore || skip=1
321+
322+
if [ "$skip" == 0 ]; then
323+
make -C test/zdtm/ cleanout
324+
rm -rf test/dump
325+
setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu
326+
if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then
327+
# Note: selinux in Enforcing mode prevents us from
328+
# calling clone3() or writing to ns_last_pid on
329+
# restore; hence set to Permissive for the test and
330+
# then set back.
331+
selinuxmode=$(getenforce)
332+
if [ "$selinuxmode" != "Disabled" ]; then
333+
setenforce Permissive
334+
fi
252335

253-
if [ -z "$SKIP_EXT_DEV_TEST" ]; then
254-
make -C test/others/mnt-ext-dev/ run
255-
if criu/criu check --feature move_mount_set_group; then
256-
EXTRA_OPTS=--mntns-compat-mode make -C test/others/mnt-ext-dev/ run
336+
fi
337+
# Run it as non-root in a user namespace. Since
338+
# CAP_CHECKPOINT_RESTORE behaves differently in non-user
339+
# namespaces (e.g. no access to map_files) this tests that we
340+
# can dump and restore under those conditions. Note that the
341+
# "... && true" part is necessary; we need at least one
342+
# statement after the tests so that bash can reap zombies in
343+
# the user namespace, otherwise it will exec the last statement
344+
# and get replaced and nobody will be left to reap our zombies.
345+
sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true"
346+
if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then
347+
if [ "$selinuxmode" != "Disabled" ]; then
348+
setenforce "$selinuxmode"
349+
fi
350+
fi
351+
setcap -r criu/criu
352+
else
353+
echo "Skipping unprivileged mode tests"
257354
fi
258-
fi
259355

260-
make -C test/others/make/ run CC="$CC"
261-
if [ -n "$CIRCLECI" ]; then
262-
# GitHub Actions (and Cirrus CI) does not provide a real TTY and CRIU will fail with:
263-
# Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting
264-
make -C test/others/shell-job/ run
265-
fi
266-
make -C test/others/criu-ns/ run
267-
make -C test/others/skip-file-rwx-check/ run
268-
make -C test/others/rpc/ run
356+
# more crit testing
357+
make -C test/others/crit run
269358

270-
./test/zdtm.py run -t zdtm/static/env00 --sibling
359+
# coredump testing
360+
make -C test/others/criu-coredump run
271361

272-
./test/zdtm.py run -t zdtm/static/maps00 --preload-libfault
273-
./test/zdtm.py run -t zdtm/static/maps02 --preload-libfault
362+
# libcriu testing
363+
make -C test/others/libcriu run
274364

275-
./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --dedup
276-
./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --noauto-dedup
277-
./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server
278-
./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server --dedup
279-
./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --pre-dump-mode read
365+
# external namespace testing
366+
make -C test/others/ns_ext run
280367

281-
./test/zdtm.py run -t zdtm/transition/pid_reuse --pre 2 # start time based pid reuse detection
282-
./test/zdtm.py run -t zdtm/transition/pidfd_store_sk --rpc --pre 2 # pidfd based pid reuse detection
368+
# config file parser and parameter testing
369+
make -C test/others/config-file run
283370

284-
./test/zdtm.py run -t zdtm/static/socket-tcp-local --norst
371+
# action script testing
372+
make -C test/others/action-script run
285373

286-
ip net add test
287-
./test/zdtm.py run -t zdtm/static/env00 -f h --join-ns
374+
# Skip all further tests when running with GCOV=1
375+
# The one test which currently cannot handle GCOV testing is
376+
# compel/test. Probably because the GCOV Makefile infrastructure
377+
# does not exist in compel.
378+
[ -n "$GCOV" ] && return 0
288379

289-
# RPC testing
290-
./test/zdtm.py run -t zdtm/static/env00 --rpc # Basic
291-
./test/zdtm.py run -t zdtm/static/env00 --rpc --pre 2 --page-server
292-
./test/zdtm.py run -t zdtm/static/ptrace_sig -f h --rpc # Error handling (crfail test)
380+
# compel testing
381+
make -C compel/test
293382

294-
./test/zdtm.py run --empty-ns -T zdtm/static/socket-tcp*-local --iter 2
383+
# amdgpu and cuda plugin testing
384+
make amdgpu_plugin
385+
make -C plugins/amdgpu/ test_topology_remap
386+
./plugins/amdgpu/test_topology_remap
295387

296-
./test/zdtm.py run -t zdtm/static/env00 -t zdtm/transition/fork -t zdtm/static/ghost_holes00 -t zdtm/static/socket-tcp -t zdtm/static/msgque -k always
297-
./test/crit-recode.py
388+
./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin cuda
389+
./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu
390+
./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda
391+
./test/zdtm.py run -t zdtm/static/busyloop00 --criu-plugin inventory_test_enabled inventory_test_disabled
298392

299-
# Rootless tests
300-
# Check if cap_checkpoint_restore is supported and also if unshare -c is supported.
301-
#
302-
# Do not run this test in a container (see https://github.com/checkpoint-restore/criu/issues/2312).
303-
# Before v6.8-rc1~215^2~6, the kernel currently did not show correct device and
304-
# inode numbers in /proc/pid/maps for stackable file systems.
305-
skip=0
306-
findmnt -no FSTYPE / | grep overlay && {
307-
./criu/criu check --feature overlayfs_maps || skip=1
393+
./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint --fault 138
308394
}
309-
unshare -c /bin/true || skip=1
310-
capsh --supports=cap_checkpoint_restore || skip=1
311-
312-
if [ "$skip" == 0 ]; then
313-
make -C test/zdtm/ cleanout
314-
rm -rf test/dump
315-
setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu
316-
if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then
317-
# Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back.
318-
selinuxmode=$(getenforce)
319-
if [ "$selinuxmode" != "Disabled" ]; then
320-
setenforce Permissive
321-
fi
322395

396+
# When sharding is enabled, shards 0..count-1 run sharded zdtm tests and
397+
# shard "count" (the extra shard) runs only the non-shardable tests.
398+
# When sharding is not enabled, run everything sequentially.
399+
if [ -z "$ZDTM_SHARD_COUNT" ] || [ "$ZDTM_SHARD_COUNT" -eq 0 ]; then
400+
# No sharding: run all zdtm tests followed by non-shardable tests
401+
./test/zdtm.py run -a -p 2 --keep-going "${ZDTM_OPTS[@]}"
402+
if criu/criu check --feature move_mount_set_group; then
403+
./test/zdtm.py run -a -p 2 --mntns-compat-mode --keep-going "${ZDTM_OPTS[@]}"
323404
fi
324-
# Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore
325-
# under those conditions. Note that the "... && true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace,
326-
# otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies.
327-
sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true"
328-
if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then
329-
if [ "$selinuxmode" != "Disabled" ]; then
330-
setenforce "$selinuxmode"
331-
fi
332-
fi
333-
setcap -r criu/criu
405+
./test/zdtm.py run -a -p 2 --keep-going --criu-config "${ZDTM_OPTS[@]}"
406+
run_non_shardable_tests
407+
elif [ "$ZDTM_SHARD_INDEX" -eq "$ZDTM_SHARD_COUNT" ]; then
408+
# This is the extra non-shardable shard (index == count, e.g. shard 4
409+
# when count is 4). Only run non-shardable tests, skip zdtm shards.
410+
run_non_shardable_tests
334411
else
335-
echo "Skipping unprivileged mode tests"
412+
# Shards 0..count-1: run only the sharded zdtm tests
413+
./test/zdtm.py run -a -p 2 --keep-going "${SHARD_OPTS[@]}" "${ZDTM_OPTS[@]}"
414+
if criu/criu check --feature move_mount_set_group; then
415+
./test/zdtm.py run -a -p 2 --mntns-compat-mode --keep-going "${SHARD_OPTS[@]}" "${ZDTM_OPTS[@]}"
416+
fi
417+
./test/zdtm.py run -a -p 2 --keep-going --criu-config "${SHARD_OPTS[@]}" "${ZDTM_OPTS[@]}"
336418
fi
337-
338-
# more crit testing
339-
make -C test/others/crit run
340-
341-
# coredump testing
342-
make -C test/others/criu-coredump run
343-
344-
# libcriu testing
345-
make -C test/others/libcriu run
346-
347-
# external namespace testing
348-
make -C test/others/ns_ext run
349-
350-
# config file parser and parameter testing
351-
make -C test/others/config-file run
352-
353-
# action script testing
354-
make -C test/others/action-script run
355-
356-
# Skip all further tests when running with GCOV=1
357-
# The one test which currently cannot handle GCOV testing is compel/test
358-
# Probably because the GCOV Makefile infrastructure does not exist in compel
359-
[ -n "$GCOV" ] && exit 0
360-
361-
# compel testing
362-
make -C compel/test
363-
364-
# amdgpu and cuda plugin testing
365-
make amdgpu_plugin
366-
make -C plugins/amdgpu/ test_topology_remap
367-
./plugins/amdgpu/test_topology_remap
368-
369-
./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin cuda
370-
./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu
371-
./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda
372-
./test/zdtm.py run -t zdtm/static/busyloop00 --criu-plugin inventory_test_enabled inventory_test_disabled
373-
374-
./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint --fault 138

0 commit comments

Comments
 (0)