Skip to content

Commit eedfdf1

Browse files
committed
ci: Split up network fetches and retry with Justfile targets
Our CI is long and flakes are painful, especially with merge queues. Let's take a heavy hammer: refactor all network-fetched resources into a clearly separate stage, then wrap that stage in a fully enforced retry loop, which uses more attempts and longer delays in CI. Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters <walters@verbum.org>
1 parent 961141a commit eedfdf1

7 files changed

Lines changed: 366 additions & 235 deletions

File tree

.github/workflows/ci.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ env:
2727
# as the default runner user doesn't have access
2828
LIBVIRT_DEFAULT_URI: "qemu:///session"
2929
DEV_IMAGE: ghcr.io/bootc-dev/dev-bootc
30+
# Retry parameters for `just build-fetch` (transient Koji/Copr/quay.io failures)
31+
BOOTC_CI_RETRIES: "10"
32+
BOOTC_CI_DELAY: "60"
3033

3134
concurrency:
3235
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -108,6 +111,9 @@ jobs:
108111
run: sudo tune2fs -O verity $(findmnt -vno SOURCE /)
109112
- name: Install utils
110113
run: sudo apt -y install fsverity just
114+
- name: Fetch external dependencies (with retry)
115+
run: just build-fetch
116+
111117
- name: Integration tests
112118
run: |
113119
set -xeu
@@ -271,6 +277,9 @@ jobs:
271277
name: packages-${{ matrix.test_os }}
272278
path: target/packages/
273279

280+
- name: Fetch external dependencies (with retry)
281+
run: BOOTC_SKIP_PACKAGE=1 just build-fetch
282+
274283
- name: Build container
275284
run: |
276285
BOOTC_SKIP_PACKAGE=1 just bootloader=$BOOTC_bootloader build
@@ -355,6 +364,9 @@ jobs:
355364
name: packages-${{ matrix.test_os }}
356365
path: target/packages/
357366

367+
- name: Fetch external dependencies (with retry)
368+
run: just build-fetch
369+
358370
- name: Run upgrade test
359371
run: just test-upgrade
360372

Dockerfile

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,18 +53,20 @@ RUN --mount=type=tmpfs,target=/run --mount=type=tmpfs,target=/tmp \
5353
/run/packaging/enable-compose-repos
5454
RUN --mount=type=tmpfs,target=/run --mount=type=tmpfs,target=/tmp /usr/libexec/bootc-base-imagectl build-rootfs --manifest=standard /target-rootfs
5555

56-
FROM scratch as base
56+
FROM scratch as fetch
5757
COPY --from=target-base /target-rootfs/ /
5858
# SKIP_CONFIGS=1 skips LBIs, test kargs, and install configs (for FCOS testing)
5959
ARG SKIP_CONFIGS
6060
ARG boot_type
6161
ARG seal_state
62-
# Use tmpfs for /run and /tmp with bind mounts inside to avoid leaking mount stubs into the image
62+
# All network-fetching operations: package installs from distro repos, Copr, Koji.
63+
# Separated so `just build-fetch` (which runs `podman build --target=fetch`) can be retried independently on
64+
# transient network failures without re-running the configuration phase.
6365
RUN --mount=type=tmpfs,target=/run --mount=type=tmpfs,target=/tmp \
6466
--mount=type=bind,from=src,src=/src/hack,target=/run/hack <<-EOF
6567
set -ex
6668

67-
cd /run/hack/ && SKIP_CONFIGS="${SKIP_CONFIGS}" ./provision-derived.sh
69+
cd /run/hack/ && SKIP_CONFIGS="${SKIP_CONFIGS}" ./provision-fetch.sh
6870

6971
pkgs_to_install=()
7072
if [[ "${seal_state}" == "sealed" ]]; then
@@ -106,7 +108,7 @@ CMD ["/sbin/init"]
106108

107109
# This layer contains things which aren't in the default image and may
108110
# be used for sealing images in particular.
109-
FROM base as tools
111+
FROM fetch as tools
110112
RUN --mount=type=tmpfs,target=/run --mount=type=tmpfs,target=/tmp \
111113
--mount=type=bind,from=packaging,src=/,target=/run/packaging \
112114
/run/packaging/initialize-sealing-tools
@@ -118,6 +120,14 @@ RUN --mount=type=tmpfs,target=/run --mount=type=tmpfs,target=/tmp \
118120
# This is verified in `cargo xtask check-buildsys`.
119121
# -------------
120122

123+
FROM fetch as base
124+
ARG SKIP_CONFIGS
125+
# Local configuration only — no network access required or permitted.
126+
# Sits after the cutoff so the linter enforces --network=none automatically.
127+
RUN --network=none --mount=type=tmpfs,target=/run --mount=type=tmpfs,target=/tmp \
128+
--mount=type=bind,from=src,src=/src/hack,target=/run/hack \
129+
sh -c 'cd /run/hack/ && SKIP_CONFIGS="${SKIP_CONFIGS}" ./provision-configure.sh'
130+
121131
FROM buildroot as build
122132
# Version for RPM build (optional, computed from git in Justfile)
123133
ARG pkgversion

Justfile

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,56 @@ build: package _keygen && _pull-lbi-images
8686
eval $(just _git-build-vars)
8787
podman build {{_nocache_arg}} --build-arg=image_version=${VERSION} --build-context "packages=${pkg_path}" -t {{base_img}} {{buildargs}} .
8888

89+
# Fetch all external dependencies with a retry loop.
90+
#
91+
# This runs `podman build --target=fetch` for both the main image and the
92+
# upgrade-source image, retrying on transient network failures (Koji 503s,
93+
# Copr outages, quay.io blips, etc.). In CI this runs as its own step
94+
# before `just build` / `just test-upgrade` so that flakes don't require
95+
# re-queueing the entire PR.
96+
#
97+
# The retry parameters can be overridden via environment variables:
98+
# BOOTC_CI_RETRIES=10 BOOTC_CI_DELAY=60 just build-fetch
99+
[group('core')]
100+
build-fetch: _keygen
    #!/bin/bash
    # Pre-fetch every network-backed input, retrying each step on failure.
    #
    # Environment:
    #   BOOTC_CI_RETRIES  maximum attempts per command (default 3); values
    #                     below 1 are raised to 1 so each command runs at
    #                     least once
    #   BOOTC_CI_DELAY    argument handed to sleep(1) between attempts
    #                     (default 30)
    set -euo pipefail
    retries=${BOOTC_CI_RETRIES:-3}
    delay=${BOOTC_CI_DELAY:-30}
    # A non-numeric count would make `seq` fail inside the `for` list below,
    # which `set -e` does not catch; the loop would then run zero times and
    # report a misleading "all attempts failed". Reject it up front instead.
    case "$retries" in
        *[!0-9]*)
            echo "--- BOOTC_CI_RETRIES must be a non-negative integer, got '${retries}'" >&2
            exit 1
            ;;
    esac
    # With a count of 0 nothing would ever be attempted; always try once.
    if [ "$retries" -lt 1 ]; then
        retries=1
    fi
    # retry CMD [ARGS...]
    # Run CMD up to $retries times, sleeping $delay after each failed attempt
    # except the last. Returns 0 on the first success, 1 if every attempt fails.
    retry() {
        local attempt
        for attempt in $(seq 1 "$retries"); do
            echo "--- Attempt ${attempt}/${retries}: $*"
            if "$@"; then
                return 0
            fi
            if [ "$attempt" -lt "$retries" ]; then
                echo "--- Attempt ${attempt} failed, retrying in ${delay}s..."
                sleep "$delay"
            fi
        done
        echo "--- All ${retries} attempts failed: $*" >&2
        return 1
    }
    # Pull the base images explicitly so failures are retried cleanly
    # before we even start the container build.
    retry podman pull -q {{base}}
    retry podman pull -q {{buildroot_base}}
    # Pull LBI images (also fetched later by _pull-lbi-images, but doing it
    # here means a failure is retried rather than aborting the full build).
    for img in {{lbi_images}}; do
        retry podman pull -q "$img"
    done
    # Build the network-heavy fetch stage of the main image. If this
    # succeeds, `just build` will get a cache hit on the fetch layer and
    # run entirely offline.
    # Note: buildargs (not base_buildargs) is needed here because the
    # target-base stage requires --cap-add/--security-opt for bwrap.
    retry podman build {{_nocache_arg}} --target=fetch {{buildargs}} .
    # Same for the upgrade-source image used by test-upgrade.
    retry podman build {{_nocache_arg}} --build-arg=base={{base}} \
        --target=fetch -f tmt/tests/Dockerfile.upgrade-source .
138+
89139
# Show available build variants and current configuration
90140
[group('core')]
91141
list-variants:

hack/provision-configure.sh

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
#!/bin/bash
# All local filesystem configuration for a derived test image.
# No network access required; this runs after provision-fetch.sh has
# installed all packages.  See also: Dockerfile fetch/base stage split.
#
# Usage: provision-configure.sh [cloudinit]
#   cloudinit     additionally hook cloud-init.target into default.target
#                 and write a cloud-init drop-in enabling root login
# Environment:
#   SKIP_CONFIGS  when non-empty, skip installing LBIs, test kargs,
#                 install configs and os-image-map.json
set -xeu

cloudinit=0
case ${1:-} in
  cloudinit) cloudinit=1 ;;
  "") ;;
  *) echo "Unhandled flag: ${1:-}" 1>&2; exit 1 ;;
esac

# Clean root's homedir (provision-fetch.sh may have left cargo/dnf state).
rm -rf /var/roothome/.config
mkdir -p -m 0700 /var/roothome

# Nushell config for root: store the files under /usr so they are covered by
# the OS image, then use tmpfiles.d 'C' to copy them into /var/roothome at
# first boot.  Writing directly to /var would require tmpfiles entries anyway
# and would fail `bootc container lint --fatal-warnings`.
mkdir -p /usr/share/bootc-test/nushell-skel
echo '$env.config = { show_banner: false, }' > /usr/share/bootc-test/nushell-skel/config.nu
touch /usr/share/bootc-test/nushell-skel/env.nu
cat >/usr/lib/tmpfiles.d/bootc-test-nushell.conf <<'EOF'
d /var/roothome/.config 0700 root root - -
d /var/roothome/.config/nushell 0700 root root - -
C+ /var/roothome/.config/nushell/config.nu 0600 root root - /usr/share/bootc-test/nushell-skel/config.nu
C+ /var/roothome/.config/nushell/env.nu 0600 root root - /usr/share/bootc-test/nushell-skel/env.nu
EOF

# kargs for serial console.  The redirect appends, so anything already in the
# file is kept; make sure the directory exists so the redirect cannot fail.
mkdir -p /usr/lib/bootc/kargs.d
cat <<KARGEOF >> /usr/lib/bootc/kargs.d/20-console.toml
kargs = ["console=ttyS0,115200n8"]
KARGEOF

if test "$cloudinit" = 1; then
  # Without the directory, `ln -s` would create a symlink *named*
  # default.target.wants instead of a link inside it.
  mkdir -p /usr/lib/systemd/system/default.target.wants
  ln -s ../cloud-init.target /usr/lib/systemd/system/default.target.wants
  # Allow root SSH login for testing with bcvk/tmt
  mkdir -p /etc/cloud/cloud.cfg.d
  cat > /etc/cloud/cloud.cfg.d/80-enable-root.cfg <<'CLOUDEOF'
# Enable root login for testing
disable_root: false

# In image mode, the host root filesystem is mounted at /sysroot, not /
# That is the one we should attempt to resize, not what is mounted at /
growpart:
  mode: auto
  devices: ["/sysroot"]
resize_rootfs: false
CLOUDEOF
fi

cat >/usr/lib/tmpfiles.d/bootc-cloud-init.conf <<'EOF'
d /var/lib/cloud 0755 root root - -
EOF

# Fast track tmpfiles.d content from the base image, xref
# https://gitlab.com/fedora/bootc/base-images/-/merge_requests/92
if test '!' -f /usr/lib/tmpfiles.d/bootc-base-rpmstate.conf; then
  cat >/usr/lib/tmpfiles.d/bootc-base-rpmstate.conf <<'EOF'
# Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=771713
d /var/lib/rpm-state 0755 - - -
EOF
fi
# Only add the buildinfo entries when no existing tmpfiles.d snippet mentions them.
if ! grep -q -r var/roothome/buildinfo /usr/lib/tmpfiles.d; then
  cat > /usr/lib/tmpfiles.d/bootc-contentsets.conf <<'EOF'
# Workaround for https://github.com/konflux-ci/build-tasks-dockerfiles/pull/243
d /var/roothome/buildinfo 0755 - - -
d /var/roothome/buildinfo/content_manifests 0755 - - -
# Note we don't actually try to recreate the content; this just makes the linter ignore it
f /var/roothome/buildinfo/content_manifests/content-sets.json 0644 - - -
EOF
fi

# And add missing sysusers.d entries
if ! grep -q -r sudo /usr/lib/sysusers.d; then
  cat >/usr/lib/sysusers.d/bootc-sudo-workaround.conf <<'EOF'
g sudo 16
EOF
fi

# dhcpcd: when the package is installed, declare its user (unless some
# sysusers.d snippet already mentions it) and its state directory, then drop
# the directory from the image so tmpfiles.d recreates it.
if rpm -q dhcpcd &>/dev/null; then
  if ! grep -q -r dhcpcd /usr/lib/sysusers.d; then
    cat >/usr/lib/sysusers.d/bootc-dhcpcd-workaround.conf <<'EOF'
u dhcpcd - 'Minimalistic DHCP client' /var/lib/dhcpcd
EOF
  fi
  cat >/usr/lib/tmpfiles.d/bootc-dhcpd.conf <<'EOF'
d /var/lib/dhcpcd 0755 root dhcpcd - -
EOF
  rm -rf /var/lib/dhcpcd
fi
# dhclient: same treatment, keyed on the state directory being present.
if test -d /var/lib/dhclient; then
  cat >/usr/lib/tmpfiles.d/bootc-dhclient.conf <<'EOF'
d /var/lib/dhclient 0755 root root - -
EOF
  rm -rf /var/lib/dhclient
fi

# The following configs are skipped when SKIP_CONFIGS=1, which is used
# for testing bootc install on Fedora CoreOS where these would conflict.
if test -z "${SKIP_CONFIGS:-}"; then
  # The sources below (lbi/, test-kargs/, install-test-configs/,
  # os-image-map.json) are given relative to this script's directory, so
  # enter it rather than relying on the caller's working directory.
  cd "$(dirname "$0")"

  # For test-22-logically-bound-install
  install -D -m 0644 -t /usr/share/containers/systemd/ lbi/*
  mkdir -p /usr/lib/bootc/bound-images.d
  for x in curl.container curl-base.image podman.image; do
    ln -s "/usr/share/containers/systemd/$x" "/usr/lib/bootc/bound-images.d/$x"
  done

  # Add some testing kargs into our dev builds
  install -D -t /usr/lib/bootc/kargs.d test-kargs/*
  # Also copy in some default install configs we use for testing
  install -D -t /usr/lib/bootc/install/ install-test-configs/*

  # Install os-image-map.json for tests that need to select OS-matched images
  install -D -m 0644 os-image-map.json /usr/share/bootc/os-image-map.json
else
  echo "SKIP_CONFIGS is set, skipping LBIs, test kargs, and install configs"
fi

0 commit comments

Comments
 (0)