Skip to content

Commit c417781

Browse files
perf(scripts): speed up RHDH operator install using install-rhdh-catalog-source.sh (~30 min → ~2 min) (#2870)
* perf(scripts): parallelize IIB bundle processing (~27 min → ~5 min)

  - Skips slow `skopeo inspect` (~42s/bundle) — attempts the copy directly instead; failed copies (~3s) are faster than successful inspects
  - Processes bundles in parallel up to MAX_PARALLEL (default 10), with a portable `kill -0` throttle loop that prunes finished PIDs each iteration
  - Collects per-worker sed files and applies them in one pass after all bundles complete, avoiding concurrent writes to render.yaml
  - Runs `opm render` and cluster registry setup in parallel since they are independent; waits before the bundle-processing phase begins
  - Replaces check-then-delete secret pattern with --ignore-not-found
  - Deletes existing CatalogSource before recreating to force OLM re-index when the tag is unchanged but the digest has changed (rebuilt IIB)
  - Fails loudly if any bundle fails to process (was: silent error log)

  Assisted-by: Claude Code
  Co-Authored-By: Claude Code <noreply@anthropic.com>

* fix(scripts): add set -euo pipefail to process_bundle and preserve skopeo stderr

  Background subshells don't inherit set -e from the parent, so intermediate failures (umoci, skopeo push) went undetected and the worker would write a .sed entry for a broken bundle. Also redirect speculative copy stderr to a per-bundle file instead of /dev/null so auth failures, timeouts, and disk errors are debuggable.

  Assisted-by: Claude Code
  Co-Authored-By: Claude Code <noreply@anthropic.com>

* chore(scripts): clean up comments to focus on WHY, not WHAT

  Assisted-by: Claude Code
  Co-Authored-By: Claude Code <noreply@anthropic.com>

* chore(scripts): clarify MAX_PARALLEL comment

  Assisted-by: Claude Code
  Co-Authored-By: Claude Code <noreply@anthropic.com>

* fix(scripts): validate MAX_PARALLEL and fix zombie processes on exit

  - Validate MAX_PARALLEL is a positive integer, exit with clear error otherwise (prevents infinite hang with 0 or crash with non-numeric)
  - Consolidate 3 separate trap EXIT calls into one — they were overwriting each other, so only the last one ran (pre-existing bug causing kubectl port-forward zombies and TMPDIR not being cleaned)
  - Remove unused kanikoLogsPid variable

  Assisted-by: Claude Code
  Co-Authored-By: Claude Code <noreply@anthropic.com>

* fix(scripts): replace kill 0 with jobs -p to avoid killing parent process group

  kill 0 sends SIGTERM to the entire process group including the parent shell/CI harness, causing segfaults on normal exit. Use jobs -p to target only this script's background jobs. Split INT/TERM from EXIT to avoid re-entrant cleanup.

  Assisted-by: Claude Code
  Co-Authored-By: Claude Code <noreply@anthropic.com>

---------

Co-authored-by: Claude Code <noreply@anthropic.com>
1 parent ea59877 commit c417781

1 file changed

Lines changed: 122 additions & 48 deletions

File tree

.rhdh/scripts/install-rhdh-catalog-source.sh

Lines changed: 122 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ NAMESPACE_SUBSCRIPTION="rhdh-operator"
1414
OLM_CHANNEL="fast"
1515
UPSTREAM_IIB_OVERRIDE=""
1616
INSTALL_PLAN_APPROVAL="Automatic"
17+
# Prints the canonical decimal form of a positive-integer string.
# Arguments: $1 - candidate value
# Outputs:   the value in base 10 without leading zeros, on stdout
# Returns:   0 when $1 is a positive decimal integer, 1 otherwise
function normalize_max_parallel() {
  local raw="${1:-}"
  if ! [[ "$raw" =~ ^[0-9]+$ ]]; then
    return 1
  fi
  # Force base 10: bash arithmetic reads a leading zero as octal, so "08" would make
  # every numeric comparison on the value error out and "010" would silently mean 8.
  local value=$((10#$raw))
  if [[ $value -lt 1 ]]; then
    return 1
  fi
  echo "$value"
}

# Upper bound on the number of bundles processed concurrently; overridable from the environment.
MAX_PARALLEL="${MAX_PARALLEL:-10}"
if ! normalized_max_parallel="$(normalize_max_parallel "$MAX_PARALLEL")"; then
  echo "[ERROR] MAX_PARALLEL must be a positive integer, got: '$MAX_PARALLEL'" >&2
  exit 1
fi
# Store the canonical form so later numeric comparisons never see a leading zero.
MAX_PARALLEL="$normalized_max_parallel"
unset normalized_max_parallel
1722

1823
function logf() {
1924
set -euo pipefail
@@ -150,56 +155,120 @@ function k8s_check_bundle_manifest_default_config() {
150155
echo "ok"
151156
}
152157

158+
# Pulls one operator bundle, rewrites its internal-registry image refs to quay.io,
# pushes the result to the cluster registry and writes a sed replacement command to
# sed_commands_dir for the caller to batch-apply after all bundles complete.
#
# Arguments:
#   $1 - bundle image ref to pull
#   $2 - original bundle image ref, i.e. the text the generated sed rule matches
#   $3 - bundle digest; names the working dir, the pushed tag and the .sed file
#   $4 - registry host the repacked bundle is pushed to
#   $5 - registry URL written into the generated sed rule
#   $6 - directory receiving the per-bundle <digest>.sed file
#   $7 - bundle sequence number, used in log messages only
# Returns: 0 on success and also when the initial pull fails (the bundle is skipped and
#          no .sed file is written); non-zero if a later step fails.
# Note: works in ./bundles/<digest> relative to the current directory.
function process_bundle() {
  set -euo pipefail

  local bundleImg="$1"
  local originalBundleImg="$2"
  local digest="$3"
  local my_registry="$4"
  local internal_registry_url="$5"
  local sed_commands_dir="$6"
  local bundle_id="$7"

  local bundle_dir="bundles/${digest}"
  mkdir -p "${bundle_dir}"

  # Failed copies are faster than successful inspects, so just attempt the pull;
  # stderr is kept per bundle so the reason for a skip stays debuggable.
  if ! skopeo copy "docker://$bundleImg" "oci:./${bundle_dir}/src:latest" 2>"${bundle_dir}/copy.err"; then
    debugf "bundle #${bundle_id}: skopeo copy failed, skipping (see ${bundle_dir}/copy.err)" >&2
    return 0
  fi
  debugf "bundle #${bundle_id}: pulled ${bundleImg}" >&2

  umoci unpack --image "./${bundle_dir}/src:latest" "./${bundle_dir}/unpacked" --rootless

  local folder file
  for folder in manifests metadata; do
    for file in "./${bundle_dir}/unpacked/rootfs/${folder}"/*; do
      if [ -f "$file" ]; then
        # One pass per file; dots are escaped so they match literally rather than any character.
        sed -i \
          -e 's#registry\.redhat\.io/rhdh#quay.io/rhdh#g' \
          -e 's#registry\.stage\.redhat\.io/rhdh#quay.io/rhdh#g' \
          -e 's#registry-proxy\.engineering\.redhat\.com/rh-osbs/rhdh-#quay.io/rhdh/#g' \
          "$file"
      fi
    done
  done

  umoci repack --image "./${bundle_dir}/src:latest" "./${bundle_dir}/unpacked"

  local newBundleImage="${my_registry}/rhdh/rhdh-operator-bundle:${digest}"
  skopeo copy --dest-tls-verify=false "oci:./${bundle_dir}/src:latest" "docker://${newBundleImage}"
  debugf "bundle #${bundle_id}: pushed to ${newBundleImage}" >&2

  local newBundleImageAsInt="${internal_registry_url}/rhdh/rhdh-operator-bundle:${digest}"
  # The original ref becomes a sed regex, so escape its dots to keep the match literal.
  local escapedOriginalBundleImg
  escapedOriginalBundleImg=$(printf '%s' "$originalBundleImg" | sed 's/\./\\./g')
  # Each worker writes to its own file (keyed by digest) — no locking needed.
  printf '%s\n' "s#${escapedOriginalBundleImg}#${newBundleImageAsInt}#g" > "${sed_commands_dir}/${digest}.sed"
}
202+
153203
# Processes every operator bundle referenced by the rendered IIB in parallel, then rewrites
# render.yaml to point at the pushed copies and regenerates the IIB Dockerfile.
#
# Globals:   TMPDIR (read) - working dir containing rhdh/rhdh/render.yaml
#            MAX_PARALLEL (read) - max concurrent process_bundle workers (defaults to 10)
# Arguments: $1 - internal registry URL, passed to process_bundle for the render.yaml refs
#            $2 - registry host, passed to process_bundle as the push destination
# Returns:   non-zero if render.yaml lists no operator bundle or any worker fails.
function update_refs_in_iib_bundles() {
  set -euo pipefail

  local internal_registry_url="$1"
  local my_registry="$2"
  local render_yaml="${TMPDIR}/rhdh/rhdh/render.yaml"
  local max_parallel="${MAX_PARALLEL:-10}"

  # De-duplicate across the whole file, keeping first-seen order. Two workers on the same
  # digest would share one bundles/<digest> directory and race each other.
  local bundle_images=()
  local image_ref
  while IFS= read -r image_ref; do
    if [[ -n "$image_ref" ]]; then
      bundle_images+=("$image_ref")
    fi
  done < <(grep -E '^image: .*operator-bundle' "$render_yaml" | awk '!seen[$2]++ { print $2 }')

  local total_bundles="${#bundle_images[@]}"
  if [[ $total_bundles -eq 0 ]]; then
    errorf "no operator bundle images found in ${render_yaml}" >&2
    return 1
  fi
  infof "Processing ${total_bundles} bundles (max ${max_parallel} parallel)..." >&2

  local sed_commands_dir="${TMPDIR}/sed_commands"
  mkdir -p "$sed_commands_dir"

  local bundle_count=0
  local pids=()    # every worker ever started; waited on below to collect exit codes
  local active=()  # workers still running as of the last throttle check
  local alive=()
  local originalBundleImg bundleImg digest pid

  for originalBundleImg in "${bundle_images[@]}"; do
    bundle_count=$((bundle_count + 1))
    digest="${originalBundleImg##*@sha256:}"
    bundleImg="${originalBundleImg/registry.stage.redhat.io/quay.io}"
    bundleImg="${bundleImg/registry.redhat.io/quay.io}"
    bundleImg="${bundleImg/registry-proxy.engineering.redhat.com\/rh-osbs\/rhdh-/quay.io\/rhdh\/}"
    debugf "bundle #${bundle_count}/${total_bundles}: $originalBundleImg => $bundleImg" >&2

    # Portable alternative to `wait -n` (not available in all bash versions):
    # poll until a slot frees up, dropping finished PIDs so they are not probed again.
    while true; do
      alive=()
      for pid in ${active[@]+"${active[@]}"}; do
        if kill -0 "$pid" 2>/dev/null; then
          alive+=("$pid")
        fi
      done
      active=(${alive[@]+"${alive[@]}"})
      if [[ ${#active[@]} -lt $max_parallel ]]; then
        break
      fi
      sleep 0.2
    done

    process_bundle "$bundleImg" "$originalBundleImg" "$digest" "$my_registry" "$internal_registry_url" "$sed_commands_dir" "$bundle_count" &
    pids+=("$!")
    active+=("$!")
  done

  # A failing background job does not trip `set -e`, so each worker's status is checked explicitly.
  local failed=0
  for pid in ${pids[@]+"${pids[@]}"}; do
    if ! wait "$pid"; then
      failed=$((failed + 1))
    fi
  done
  if [[ $failed -gt 0 ]]; then
    errorf "${failed} bundle(s) failed to process" >&2
    return 1
  fi

  # Apply all per-worker replacements in one pass, after every worker has finished,
  # so render.yaml is never written concurrently.
  local sed_files=("$sed_commands_dir"/*.sed)
  if [[ -e "${sed_files[0]:-}" ]]; then
    local combined_sed="${TMPDIR}/combined_sed_commands.txt"
    cat "${sed_files[@]}" > "$combined_sed"
    local replacement_count
    replacement_count=$(wc -l < "$combined_sed" | tr -d ' ')
    infof "Applying ${replacement_count} image ref replacements to render.yaml..." >&2
    sed -i -f "$combined_sed" "$render_yaml"
  fi

  debugf "Regenerating IIB Dockerfile with updated refs..." >&2
  opm generate dockerfile rhdh/rhdh
}
205274

@@ -216,20 +285,18 @@ function ocp_install() {
216285

217286
set -euo pipefail
218287

219-
render_iib >&2
288+
# render_iib is independent of registry setup below, so run concurrently.
289+
render_iib >&2 &
290+
local render_pid=$!
220291

221292
# 1. Expose the internal cluster registry if not done already
222293
debugf "Exposing cluster registry..." >&2
223294
internal_registry_url="image-registry.openshift-image-registry.svc:5000"
224295
oc patch configs.imageregistry.operator.openshift.io/cluster --patch '{"spec":{"defaultRoute":true}}' --type=merge >&2
225296
my_registry=$(oc get route default-route -n openshift-image-registry --template='{{ .spec.host }}')
226297
skopeo login -u kubeadmin -p "$(oc whoami -t)" --tls-verify=false "$my_registry" >&2
227-
if oc -n openshift-marketplace get secret internal-reg-auth-for-rhdh &> /dev/null; then
228-
oc -n openshift-marketplace delete secret internal-reg-auth-for-rhdh >&2
229-
fi
230-
if oc -n openshift-marketplace get secret internal-reg-ext-auth-for-rhdh &> /dev/null; then
231-
oc -n openshift-marketplace delete secret internal-reg-ext-auth-for-rhdh >&2
232-
fi
298+
oc -n openshift-marketplace delete secret internal-reg-auth-for-rhdh --ignore-not-found >&2
299+
oc -n openshift-marketplace delete secret internal-reg-ext-auth-for-rhdh --ignore-not-found >&2
233300
oc -n openshift-marketplace create secret docker-registry internal-reg-ext-auth-for-rhdh \
234301
--docker-server="${my_registry}" \
235302
--docker-username=kubeadmin \
@@ -250,6 +317,11 @@ function ocp_install() {
250317
oc policy add-role-to-user system:image-puller system:serviceaccount:openshift-marketplace:default -n openshift-marketplace >&2 || true
251318
oc policy add-role-to-user system:image-puller system:serviceaccount:rhdh-operator:default -n rhdh-operator >&2 || true
252319

320+
if ! wait "$render_pid"; then
321+
errorf "opm render failed" >&2
322+
return 1
323+
fi
324+
253325
# 3. Regenerate the IIB image with the local changes to the render.yaml file and build and push it from within the cluster
254326
update_refs_in_iib_bundles "$internal_registry_url" "$my_registry" >&2
255327

@@ -407,7 +479,6 @@ EOF
407479
cat "${registry_port_fwd_out}"
408480
return 1
409481
fi
410-
trap '[[ -n "${port_fwd_pid:-}" ]] && kill ${port_fwd_pid} || true' EXIT
411482

412483
local portFwdLocalPort
413484
portFwdLocalPort=$(grep -oP '127\.0\.0\.1:\K[0-9]+' "${registry_port_fwd_out}")
@@ -458,7 +529,6 @@ EOF
458529
local timestamp
459530
local kanikoJobName
460531
local kanikoPod
461-
local kanikoLogsPid
462532
local localContext
463533
timestamp=$(date +%s)
464534
kanikoJobName="kaniko-build-${timestamp}"
@@ -522,8 +592,6 @@ EOF
522592
debugf "Waiting for Kaniko pod $kanikoPod to be ready..." >&2
523593
invoke_cluster_cli -n "${namespace}" wait --for=condition=Ready "pod/$kanikoPod" --timeout=60s >&2
524594
invoke_cluster_cli -n "${namespace}" logs -f "${kanikoPod}" >&2 &
525-
kanikoLogsPid=$!
526-
trap '[[ -n "${kanikoLogsPid:-}" ]] && kill ${kanikoLogsPid} &>/dev/null || true' EXIT
527595

528596
localContext=context.tar.gz
529597
tar -czf "${localContext}" -C rhdh . >&2
@@ -556,8 +624,10 @@ fi
556624
TMPDIR=$(mktemp -d)
557625
pushd "${TMPDIR}" > /dev/null
558626
debugf ">>> WORKING DIR: $TMPDIR <<<"
627+
559628
# Exit-time cleanup: stops this shell's background jobs, reaps them, then removes the
# working directory. Jobs are stopped before the directory is deleted so that nothing is
# still writing into the tree while it is being removed.
# Arguments: $1 - directory to delete (skipped when empty)
function cleanup_on_exit() {
  local tmpdir="${1:-}"
  local bg_pids
  # `jobs -p` lists only this shell's own jobs; unlike `kill 0` it never signals the
  # parent process group.
  bg_pids=$(jobs -p)
  if [[ -n "$bg_pids" ]]; then
    # shellcheck disable=SC2086 # intentional word-splitting: one PID per word
    kill $bg_pids 2>/dev/null || true
  fi
  wait 2>/dev/null || true
  if [[ -n "$tmpdir" ]]; then
    rm -fr -- "$tmpdir" || true
  fi
}

# The path is expanded now (so later changes to TMPDIR cannot redirect the delete) and
# shell-quoted with %q so any character in it survives re-parsing of the trap string.
# shellcheck disable=SC2064
trap "cleanup_on_exit $(printf '%q' "$TMPDIR")" EXIT
# INT/TERM only request an exit; the EXIT trap then runs the cleanup exactly once.
trap "exit 1" INT TERM
561631

562632
detect_ocp_and_set_env_var
563633
if [[ "${IS_OPENSHIFT}" = "true" ]]; then
@@ -750,6 +820,10 @@ if [[ "${IS_OPENSHIFT}" = "true" ]]; then
750820
NAMESPACE_CATALOGSOURCE="openshift-marketplace"
751821
fi
752822

823+
# Delete existing CatalogSource first to force OLM to re-pull the image.
824+
# Without this, if the tag is unchanged but the digest changed (rebuilt IIB), OLM reports "unchanged" and never re-indexes.
825+
invoke_cluster_cli delete catalogsource "${CATALOGSOURCE_NAME}" -n "${NAMESPACE_CATALOGSOURCE}" --ignore-not-found
826+
753827
echo "apiVersion: operators.coreos.com/v1alpha1
754828
kind: CatalogSource
755829
metadata:

0 commit comments

Comments
 (0)