Skip to content

Commit caa3bfa

Browse files
jparrillclaude
andcommitted
fix(OCPBUGS-77530): remove pause mechanism from backup/restore plugins
Remove the pause/unpause logic for HostedClusters, NodePools and CAPI resources (clusters, machines) during backup and restore operations. The pause mechanism was causing a leak where resources could get stuck in a paused state when a HostedCluster deletion races with an OADP backup. The HyperShift nodepool controller's deletion path returns early before reaching the pause/unpause logic, leaving resources with spec.pausedUntil set indefinitely. Changes: - Remove pauseAll/unPauseAll methods and progress-based pause logic from BackupPlugin - Remove UpdateHostedCluster, UpdateNodepools, WaitForPausedPropagated and updateHypershiftResource functions from common utils - Remove CAPI paused annotation (cluster.x-k8s.io/paused) from backup and restore flows - Remove pause audit annotations (paused-by, paused-at) and related constants - Remove shouldLogProgress throttling (only used for pause logging) - Clean up BackupPluginValidator by removing HCPaused/NPaused fields - Remove all related tests Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Juan Manuel Parrilla Madrid <jparrill@redhat.com>
1 parent 941041b commit caa3bfa

8 files changed

Lines changed: 6 additions & 1370 deletions

File tree

pkg/common/types.go

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package common
33
import "time"
44

55
const (
6-
CAPIPausedAnnotationName string = "cluster.x-k8s.io/paused"
76
CommonBackupAnnotationName string = "hypershift.openshift.io/common-backup-plugin"
87
CommonRestoreAnnotationName string = "hypershift.openshift.io/common-restore-plugin"
98
FSBackupLabelName string = "hypershift.openshift.io/fsbackup"
@@ -15,22 +14,15 @@ const (
1514
PluginConfigMapName string = "hypershift-oadp-plugin-config"
1615

1716
// Default values for the backup plugin.
18-
defaultPVBackupTimeout time.Duration = 30 * time.Minute
19-
defaultPVBackupCheckPace time.Duration = 10 * time.Second
20-
defaultWaitForPausedTimeout time.Duration = 2 * time.Minute
21-
defaultWaitForTimeout time.Duration = 5 * time.Minute
17+
defaultPVBackupTimeout time.Duration = 30 * time.Minute
18+
defaultPVBackupCheckPace time.Duration = 10 * time.Second
2219

2320
DefaultK8sSAFilePath string = "/var/run/secrets/kubernetes.io/serviceaccount"
2421
KubevirtRHCOSLabel string = "hypershift.openshift.io/is-kubevirt-rhcos"
2522

2623
// Integration with Hypershift, more info here: https://github.com/openshift/hypershift/pull/6195
2724
HostedClusterRestoredFromBackupAnnotation string = "hypershift.openshift.io/restored-from-backup"
2825

29-
// OADP Plugin audit annotations for pause operations
30-
OADPPausedByAnnotation string = "oadp.openshift.io/paused-by"
31-
OADPPausedAtAnnotation string = "oadp.openshift.io/paused-at"
32-
HypershiftOADPPluginName string = "hypershift-oadp-plugin"
33-
3426
// hypershift/cluster-api kinds
3527
HostedClusterKind string = "HostedCluster"
3628
HostedControlPlaneKind string = "HostedControlPlane"

pkg/common/utils.go

Lines changed: 0 additions & 248 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ import (
2424
"k8s.io/apimachinery/pkg/types"
2525
"k8s.io/apimachinery/pkg/util/wait"
2626
"k8s.io/client-go/rest"
27-
"k8s.io/utils/ptr"
2827
cr "sigs.k8s.io/controller-runtime"
2928
crclient "sigs.k8s.io/controller-runtime/pkg/client"
3029
)
@@ -449,239 +448,6 @@ func WaitForPodVolumeBackup(ctx context.Context, c crclient.Client, log logrus.F
449448
return true, nil
450449
}
451450

452-
// WaitForPausedPropagated waits for the HostedControlPlane (HCP) associated with the given HostedCluster (HC)
453-
// to be paused. It polls the status of the HCP at regular intervals until the HCP is found to be paused or the
454-
// specified timeout is reached.
455-
func WaitForPausedPropagated(ctx context.Context, c crclient.Client, log logrus.FieldLogger, hc *hyperv1.HostedCluster, timeout time.Duration, paused string) error {
456-
if timeout == 0 {
457-
timeout = defaultWaitForPausedTimeout
458-
}
459-
hcpNamespace := GetHCPNamespace(hc.Name, hc.Namespace)
460-
461-
log = log.WithFields(logrus.Fields{
462-
"namespace": hc.Namespace,
463-
"name": hc.Name,
464-
})
465-
466-
waitCtx, cancel := context.WithTimeout(ctx, timeout)
467-
defer cancel()
468-
469-
err := wait.PollUntilContextCancel(waitCtx, 5*time.Second, true, func(ctx context.Context) (bool, error) {
470-
hcp := &hyperv1.HostedControlPlane{}
471-
if err := c.Get(ctx, types.NamespacedName{Name: hc.Name, Namespace: hcpNamespace}, hcp); err != nil {
472-
if apierrors.IsNotFound(err) {
473-
return true, nil
474-
}
475-
log.Info(err, "failed to get HostedControlPlane")
476-
return false, err
477-
}
478-
log.Infof("waiting for HCP to set PausedUntil to %s", paused)
479-
480-
// For unpause (paused == ""), we expect PausedUntil to be nil
481-
// For pause (paused != ""), we expect PausedUntil to equal paused
482-
if paused == "" {
483-
if hcp.Spec.PausedUntil == nil {
484-
log.Debug("HostedControlPlane is unpaused (PausedUntil is nil)")
485-
return true, nil
486-
}
487-
} else {
488-
if hcp.Spec.PausedUntil != nil && *hcp.Spec.PausedUntil == paused {
489-
log.Debug("HostedControlPlane is paused")
490-
return true, nil
491-
}
492-
}
493-
494-
return false, nil
495-
})
496-
497-
if err != nil {
498-
log.Errorf("giving up, HCP was not updated in the expected timeout: %v", err)
499-
return err
500-
}
501-
502-
return err
503-
}
504-
505-
// updateHypershiftResource updates Hypershift resources (HostedCluster or NodePool) with pause functionality and audit annotations.
506-
// This unified function handles both resource types and includes atomic annotation updates.
507-
func updateHypershiftResource(
508-
ctx context.Context,
509-
c crclient.Client,
510-
log logrus.FieldLogger,
511-
paused string,
512-
namespaces []string,
513-
resourceType string,
514-
listObjFactory func() crclient.ObjectList,
515-
itemsExtractor func(crclient.ObjectList) []crclient.Object,
516-
pausedUntilGetter func(crclient.Object) *string,
517-
pausedUntilSetter func(crclient.Object, *string),
518-
postUpdateHook func(context.Context, crclient.Client, logrus.FieldLogger, crclient.Object, time.Duration, string) error,
519-
) error {
520-
// Get the appropriate list object
521-
listObj := listObjFactory()
522-
523-
// Search through namespaces for resources
524-
var items []crclient.Object
525-
for _, ns := range namespaces {
526-
if err := c.List(ctx, listObj, crclient.InNamespace(ns)); err != nil {
527-
return fmt.Errorf("failed to list %s in namespace %s: %w", resourceType, ns, err)
528-
}
529-
530-
items = itemsExtractor(listObj)
531-
if len(items) > 0 {
532-
log.Debugf("found %s in namespace %s", resourceType, ns)
533-
break
534-
}
535-
}
536-
for _, item := range items {
537-
// Create a retry loop with improved backoff for better conflict resolution
538-
backoff := wait.Backoff{
539-
Steps: 10, // Increased from 5 to 10
540-
Duration: 500 * time.Millisecond, // Reduced initial duration for faster retries
541-
Factor: 2.0,
542-
Jitter: 0.1,
543-
Cap: 30 * time.Second, // Cap maximum wait time to 30 seconds
544-
}
545-
546-
err := wait.ExponentialBackoff(backoff, func() (bool, error) {
547-
// Get the latest version of the resource
548-
current := item.DeepCopyObject().(crclient.Object)
549-
if err := c.Get(ctx, crclient.ObjectKeyFromObject(item), current); err != nil {
550-
return false, err
551-
}
552-
553-
// Check if update is needed
554-
currentPausedUntil := pausedUntilGetter(current)
555-
var needsUpdate bool
556-
if paused == "" {
557-
// For unpause: we need to set pausedUntil to nil, so update needed if it's currently not nil
558-
needsUpdate = currentPausedUntil != nil
559-
} else {
560-
// For pause: we need to set pausedUntil to the paused value
561-
needsUpdate = currentPausedUntil == nil || *currentPausedUntil != paused
562-
}
563-
564-
if needsUpdate {
565-
switch paused {
566-
case "":
567-
log.Infof("setting PauseUntil to nil (unpause) in %s %s", resourceType, current.GetName())
568-
pausedUntilSetter(current, nil)
569-
removePauseAuditAnnotations(current)
570-
case "true":
571-
log.Infof("setting PauseUntil to %s in %s %s", paused, resourceType, current.GetName())
572-
pausedUntilSetter(current, ptr.To(paused))
573-
addPauseAuditAnnotations(current)
574-
}
575-
576-
// Perform the atomic update
577-
if err := c.Update(ctx, current); err != nil {
578-
if apierrors.IsConflict(err) {
579-
log.Infof("Conflict detected updating %s %s, retrying...", resourceType, current.GetName())
580-
return false, nil
581-
}
582-
return false, err
583-
}
584-
585-
// Call post-update hook if provided (for HostedCluster propagation validation)
586-
if postUpdateHook != nil {
587-
if err := postUpdateHook(ctx, c, log, current, defaultWaitForPausedTimeout, paused); err != nil {
588-
return false, err
589-
}
590-
}
591-
592-
log.Infof("Successfully set %s %s PausedUntil to %s", resourceType, current.GetName(), paused)
593-
} else {
594-
log.Debugf("%s %s already has PausedUntil set to %s, no update needed", resourceType, current.GetName(), paused)
595-
}
596-
597-
return true, nil
598-
})
599-
600-
if err != nil {
601-
return fmt.Errorf("failed to update %s %s after retries: %w", resourceType, item.GetName(), err)
602-
}
603-
}
604-
605-
return nil
606-
}
607-
608-
// UpdateHostedCluster updates the HostedCluster's necessary fields in the specified namespaces.
609-
func UpdateHostedCluster(ctx context.Context, c crclient.Client, log logrus.FieldLogger, paused string, namespaces []string) error {
610-
return updateHypershiftResource(
611-
ctx, c, log, paused, namespaces, "HostedCluster",
612-
613-
// listObjFactory - creates a new HostedClusterList
614-
func() crclient.ObjectList {
615-
return &hyperv1.HostedClusterList{Items: []hyperv1.HostedCluster{}}
616-
},
617-
618-
// itemsExtractor - extracts HostedCluster items as crclient.Object slice
619-
func(list crclient.ObjectList) []crclient.Object {
620-
hcList := list.(*hyperv1.HostedClusterList)
621-
items := make([]crclient.Object, len(hcList.Items))
622-
for i := range hcList.Items {
623-
items[i] = &hcList.Items[i]
624-
}
625-
return items
626-
},
627-
628-
// pausedUntilGetter - gets PausedUntil from HostedCluster
629-
func(obj crclient.Object) *string {
630-
hc := obj.(*hyperv1.HostedCluster)
631-
return hc.Spec.PausedUntil
632-
},
633-
634-
// pausedUntilSetter - sets PausedUntil on HostedCluster
635-
func(obj crclient.Object, paused *string) {
636-
hc := obj.(*hyperv1.HostedCluster)
637-
hc.Spec.PausedUntil = paused
638-
},
639-
640-
// postUpdateHook - validates propagation for HostedClusters
641-
func(ctx context.Context, c crclient.Client, log logrus.FieldLogger, obj crclient.Object, timeout time.Duration, paused string) error {
642-
hc := obj.(*hyperv1.HostedCluster)
643-
log.Debug("checking paused state propagation")
644-
return WaitForPausedPropagated(ctx, c, log, hc, timeout, paused)
645-
},
646-
)
647-
}
648-
649-
// UpdateNodepools updates the NodePool's necessary fields in the specified namespaces.
650-
func UpdateNodepools(ctx context.Context, c crclient.Client, log logrus.FieldLogger, paused string, namespaces []string) error {
651-
return updateHypershiftResource(
652-
ctx, c, log, paused, namespaces, "NodePool",
653-
654-
// listObjFactory - creates a new NodePoolList
655-
func() crclient.ObjectList {
656-
return &hyperv1.NodePoolList{}
657-
},
658-
659-
// itemsExtractor - extracts NodePool items as crclient.Object slice
660-
func(list crclient.ObjectList) []crclient.Object {
661-
npList := list.(*hyperv1.NodePoolList)
662-
items := make([]crclient.Object, len(npList.Items))
663-
for i := range npList.Items {
664-
items[i] = &npList.Items[i]
665-
}
666-
return items
667-
},
668-
669-
// pausedUntilGetter - gets PausedUntil from NodePool
670-
func(obj crclient.Object) *string {
671-
np := obj.(*hyperv1.NodePool)
672-
return np.Spec.PausedUntil
673-
},
674-
675-
// pausedUntilSetter - sets PausedUntil on NodePool
676-
func(obj crclient.Object, paused *string) {
677-
np := obj.(*hyperv1.NodePool)
678-
np.Spec.PausedUntil = paused
679-
},
680-
681-
// postUpdateHook - NodePools don't need propagation validation
682-
nil,
683-
)
684-
}
685451

686452
// GetCurrentNamespace reads the namespace from the Kubernetes service account
687453
// token file and returns it as a string. The file is expected to be located at
@@ -730,20 +496,6 @@ func RemoveAnnotation(metadata metav1.Object, key string) {
730496
metadata.SetAnnotations(annotations)
731497
}
732498

733-
// addPauseAuditAnnotations adds OADP audit annotations when pausing a resource.
734-
// It adds both the paused-by and paused-at annotations atomically.
735-
func addPauseAuditAnnotations(obj metav1.Object) {
736-
timestamp := time.Now().Format(time.RFC3339)
737-
AddAnnotation(obj, OADPPausedByAnnotation, HypershiftOADPPluginName)
738-
AddAnnotation(obj, OADPPausedAtAnnotation, timestamp)
739-
}
740-
741-
// removePauseAuditAnnotations removes OADP audit annotations when unpausing a resource.
742-
// It removes both the paused-by and paused-at annotations.
743-
func removePauseAuditAnnotations(obj metav1.Object) {
744-
RemoveAnnotation(obj, OADPPausedByAnnotation)
745-
RemoveAnnotation(obj, OADPPausedAtAnnotation)
746-
}
747499

748500
// AddLabel adds a label with the specified key and value to the given metadata object.
749501
// If the metadata object does not have any labels, a new map is created to store the label.

0 commit comments

Comments
 (0)