Skip to content

Commit 05a04f3

Browse files
Rohit Patilropatil010
andcommitted
Add auto-cleanup and improved error messaging for bundle unpack jobs
This change improves the bundle unpacking experience by adding automatic cleanup of failed/completed jobs and providing better troubleshooting guidance to users. Changes: - Set TTLSecondsAfterFinished to 300 seconds (5 minutes) for bundle unpack jobs to enable automatic retry when the root cause is fixed - Enhanced error messages with detailed troubleshooting steps including: - Information about automatic retry in ~5 minutes - Manual job deletion commands for immediate retry - Common failure causes (ICSP, auth, image issues) - Specific guidance for ImageContentSourcePolicy configuration The 5-minute TTL provides a balance between fast recovery and preventing job churn on permanent failures. Co-Authored-By: Rohit Patil <ropatil@redhat.com>
1 parent 4e18ba8 commit 05a04f3

4 files changed

Lines changed: 85 additions & 17 deletions

File tree

pkg/controller/bundle/bundle_unpacker.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,10 @@ func (c *ConfigMapUnpacker) job(cmRef *corev1.ObjectReference, bundlePath string
102102
},
103103
},
104104
Spec: batchv1.JobSpec{
105-
//ttlSecondsAfterFinished: 0 // can use in the future to not have to clean up job
105+
// Auto-cleanup failed/completed jobs after 5 minutes to allow automatic retry when root cause is fixed
106+
// 5 minutes provides balance between fast recovery and preventing job churn on permanent failures
107+
// See: https://kubernetes.io/docs/concepts/workloads/controllers/job/#ttl-mechanism-for-finished-jobs
108+
TTLSecondsAfterFinished: ptr.To[int32](300),
106109
Template: corev1.PodTemplateSpec{
107110
ObjectMeta: metav1.ObjectMeta{
108111
Name: cmRef.Name,

pkg/controller/bundle/bundle_unpacker_test.go

Lines changed: 59 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ func TestConfigMapUnpacker(t *testing.T) {
5050
return start
5151
}
5252
backoffLimit := int32(3)
53+
ttlSecondsAfterFinished := int32(300) // 5 minutes TTL for automatic cleanup
5354
// Used to set the default value for job.spec.ActiveDeadlineSeconds
5455
// that would normally be passed from the cmdline flag
5556
defaultUnpackDuration := 10 * time.Minute
@@ -263,8 +264,9 @@ func TestConfigMapUnpacker(t *testing.T) {
263264
},
264265
Spec: batchv1.JobSpec{
265266
// The expected job's timeout should be set to the custom annotation timeout
266-
ActiveDeadlineSeconds: &customAnnotationTimeoutSeconds,
267-
BackoffLimit: &backoffLimit,
267+
ActiveDeadlineSeconds: &customAnnotationTimeoutSeconds,
268+
BackoffLimit: &backoffLimit,
269+
TTLSecondsAfterFinished: &ttlSecondsAfterFinished,
268270
Template: corev1.PodTemplateSpec{
269271
ObjectMeta: metav1.ObjectMeta{
270272
Name: pathHash,
@@ -484,8 +486,9 @@ func TestConfigMapUnpacker(t *testing.T) {
484486
},
485487
},
486488
Spec: batchv1.JobSpec{
487-
ActiveDeadlineSeconds: &defaultUnpackTimeoutSeconds,
488-
BackoffLimit: &backoffLimit,
489+
ActiveDeadlineSeconds: &defaultUnpackTimeoutSeconds,
490+
BackoffLimit: &backoffLimit,
491+
TTLSecondsAfterFinished: &ttlSecondsAfterFinished,
489492
Template: corev1.PodTemplateSpec{
490493
ObjectMeta: metav1.ObjectMeta{
491494
Name: digestHash,
@@ -744,8 +747,9 @@ func TestConfigMapUnpacker(t *testing.T) {
744747
},
745748
},
746749
Spec: batchv1.JobSpec{
747-
ActiveDeadlineSeconds: &defaultUnpackTimeoutSeconds,
748-
BackoffLimit: &backoffLimit,
750+
ActiveDeadlineSeconds: &defaultUnpackTimeoutSeconds,
751+
BackoffLimit: &backoffLimit,
752+
TTLSecondsAfterFinished: &ttlSecondsAfterFinished,
749753
Template: corev1.PodTemplateSpec{
750754
ObjectMeta: metav1.ObjectMeta{
751755
Name: digestHash,
@@ -999,8 +1003,9 @@ func TestConfigMapUnpacker(t *testing.T) {
9991003
},
10001004
},
10011005
Spec: batchv1.JobSpec{
1002-
ActiveDeadlineSeconds: &defaultUnpackTimeoutSeconds,
1003-
BackoffLimit: &backoffLimit,
1006+
ActiveDeadlineSeconds: &defaultUnpackTimeoutSeconds,
1007+
BackoffLimit: &backoffLimit,
1008+
TTLSecondsAfterFinished: &ttlSecondsAfterFinished,
10041009
Template: corev1.PodTemplateSpec{
10051010
ObjectMeta: metav1.ObjectMeta{
10061011
Name: pathHash,
@@ -1224,8 +1229,9 @@ func TestConfigMapUnpacker(t *testing.T) {
12241229
},
12251230
},
12261231
Spec: batchv1.JobSpec{
1227-
ActiveDeadlineSeconds: &defaultUnpackTimeoutSeconds,
1228-
BackoffLimit: &backoffLimit,
1232+
ActiveDeadlineSeconds: &defaultUnpackTimeoutSeconds,
1233+
BackoffLimit: &backoffLimit,
1234+
TTLSecondsAfterFinished: &ttlSecondsAfterFinished,
12291235
Template: corev1.PodTemplateSpec{
12301236
ObjectMeta: metav1.ObjectMeta{
12311237
Name: pathHash,
@@ -1462,8 +1468,9 @@ func TestConfigMapUnpacker(t *testing.T) {
14621468
},
14631469
},
14641470
Spec: batchv1.JobSpec{
1465-
ActiveDeadlineSeconds: &defaultUnpackTimeoutSeconds,
1466-
BackoffLimit: &backoffLimit,
1471+
ActiveDeadlineSeconds: &defaultUnpackTimeoutSeconds,
1472+
BackoffLimit: &backoffLimit,
1473+
TTLSecondsAfterFinished: &ttlSecondsAfterFinished,
14671474
Template: corev1.PodTemplateSpec{
14681475
ObjectMeta: metav1.ObjectMeta{
14691476
Name: pathHash,
@@ -2075,3 +2082,43 @@ func TestSortUnpackJobs(t *testing.T) {
20752082
assert.ElementsMatch(t, tc.expectedToDelete, toDelete)
20762083
}
20772084
}
2085+
2086+
// TestBundleUnpackJobHasTTL verifies that bundle unpack jobs have TTL set for automatic cleanup
2087+
func TestBundleUnpackJobHasTTL(t *testing.T) {
2088+
cmRef := &corev1.ObjectReference{
2089+
APIVersion: "v1",
2090+
Kind: "ConfigMap",
2091+
Name: "test-bundle",
2092+
Namespace: "test-namespace",
2093+
UID: "test-uid",
2094+
}
2095+
bundlePath := "quay.io/test/bundle:v1.0.0"
2096+
secrets := []corev1.LocalObjectReference{{Name: "test-secret"}}
2097+
timeout := 10 * time.Minute
2098+
2099+
unpacker := &ConfigMapUnpacker{
2100+
opmImage: "test-opm:latest",
2101+
utilImage: "test-util:latest",
2102+
unpackTimeout: timeout,
2103+
runAsUser: 1001,
2104+
}
2105+
2106+
job := unpacker.job(cmRef, bundlePath, secrets, timeout)
2107+
2108+
// Verify TTL is set to 5 minutes (300 seconds)
2109+
require.NotNil(t, job.Spec.TTLSecondsAfterFinished, "TTLSecondsAfterFinished should be set")
2110+
assert.Equal(t, int32(300), *job.Spec.TTLSecondsAfterFinished, "TTL should be 300 seconds (5 minutes) for faster recovery")
2111+
2112+
// Verify other important job specs are still set correctly
2113+
assert.NotNil(t, job.Spec.BackoffLimit, "BackoffLimit should be set")
2114+
assert.Equal(t, int32(3), *job.Spec.BackoffLimit)
2115+
2116+
assert.NotNil(t, job.Spec.ActiveDeadlineSeconds, "ActiveDeadlineSeconds should be set")
2117+
assert.Equal(t, int64(timeout.Seconds()), *job.Spec.ActiveDeadlineSeconds)
2118+
2119+
// Verify job metadata
2120+
assert.Equal(t, cmRef.Name, job.Name)
2121+
assert.Equal(t, cmRef.Namespace, job.Namespace)
2122+
assert.Contains(t, job.Labels, install.OLMManagedLabelKey)
2123+
assert.Contains(t, job.Labels, BundleUnpackRefLabel)
2124+
}

pkg/controller/operators/catalog/operator.go

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1398,14 +1398,32 @@ func (o *Operator) syncResolvingNamespace(obj interface{}) error {
13981398
// with a condition indicating the failure.
13991399
isFailed, cond := hasBundleLookupFailureCondition(bundleLookups)
14001400
if isFailed {
1401-
err := fmt.Errorf("bundle unpacking failed. Reason: %v, and Message: %v", cond.Reason, cond.Message)
1402-
logger.Infof("%v", err)
1401+
// Create detailed error message with troubleshooting guidance
1402+
detailedMessage := fmt.Sprintf(`Bundle unpacking failed. Reason: %v, Message: %v
1403+
1404+
The failed bundle unpack job will be automatically cleaned up and retried in approximately 5 minutes.
1405+
1406+
To retry immediately, delete the failed job:
1407+
kubectl get jobs -n openshift-marketplace -l operatorframework.io/bundle-unpack-ref
1408+
kubectl delete job <job-name> -n openshift-marketplace
1409+
1410+
Common causes:
1411+
• Missing ImageContentSourcePolicy for File-Based Catalogs (required for ppc64le, s390x, arm64)
1412+
• Image pull authentication failures
1413+
• Non-existent bundle image or incorrect image reference
1414+
• Network connectivity issues to image registry
1415+
1416+
For ImageContentSourcePolicy issues:
1417+
1. Apply the correct ICSP configuration
1418+
2. Wait ~5 minutes for automatic retry OR delete the failed job manually`, cond.Reason, cond.Message)
1419+
1420+
logger.Infof("bundle unpacking failed. Reason: %v, Message: %v", cond.Reason, cond.Message)
14031421

14041422
_, updateErr := o.updateSubscriptionStatuses(
14051423
o.setSubsCond(subs, v1alpha1.SubscriptionCondition{
14061424
Type: v1alpha1.SubscriptionBundleUnpackFailed,
14071425
Reason: "BundleUnpackFailed",
1408-
Message: err.Error(),
1426+
Message: detailedMessage,
14091427
Status: corev1.ConditionTrue,
14101428
}))
14111429
if updateErr != nil {

pkg/controller/operators/catalog/subscriptions_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,7 @@ func TestSyncSubscriptions(t *testing.T) {
563563
{
564564
Type: v1alpha1.SubscriptionBundleUnpackFailed,
565565
Reason: "BundleUnpackFailed",
566-
Message: "bundle unpacking failed. Reason: JobFailed, and Message: unpack job failed",
566+
Message: "Bundle unpacking failed. Reason: JobFailed, Message: unpack job failed\n\nThe failed bundle unpack job will be automatically cleaned up and retried in approximately 5 minutes.\n\nTo retry immediately, delete the failed job:\n kubectl get jobs -n openshift-marketplace -l operatorframework.io/bundle-unpack-ref\n kubectl delete job <job-name> -n openshift-marketplace\n\nCommon causes:\n • Missing ImageContentSourcePolicy for File-Based Catalogs (required for ppc64le, s390x, arm64)\n • Image pull authentication failures\n • Non-existent bundle image or incorrect image reference\n • Network connectivity issues to image registry\n\nFor ImageContentSourcePolicy issues:\n 1. Apply the correct ICSP configuration\n 2. Wait ~5 minutes for automatic retry OR delete the failed job manually",
567567
Status: corev1.ConditionTrue,
568568
},
569569
},

0 commit comments

Comments
 (0)