Skip to content

Commit afa2858

Browse files
committed
Added sync restore in place, while keeping the secrets and services alive
1 parent 1af4c50 commit afa2858

File tree

2 files changed

+158
-5
lines changed

2 files changed

+158
-5
lines changed

pkg/cluster/cluster.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1211,6 +1211,8 @@ func (c *Cluster) Delete() error {
12111211
defer c.mu.Unlock()
12121212
c.eventRecorder.Event(c.GetReference(), v1.EventTypeNormal, "Delete", "Started deletion of cluster resources")
12131213

1214+
isRestoreInPlace := c.Annotations["postgres-operator.zalando.org/action"] == "restore-in-place"
1215+
c.logger.Debugf("restore-in-place: Deleting the cluster, verifying whether resotore-in-place is true or not: %+v\n", isRestoreInPlace)
12141216
if err := c.deleteStreams(); err != nil {
12151217
anyErrors = true
12161218
c.logger.Warningf("could not delete event streams: %v", err)
@@ -1231,7 +1233,7 @@ func (c *Cluster) Delete() error {
12311233
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Delete", "could not delete statefulset: %v", err)
12321234
}
12331235

1234-
if c.OpConfig.EnableSecretsDeletion != nil && *c.OpConfig.EnableSecretsDeletion {
1236+
if c.OpConfig.EnableSecretsDeletion != nil && *c.OpConfig.EnableSecretsDeletion && !isRestoreInPlace {
12351237
if err := c.deleteSecrets(); err != nil {
12361238
anyErrors = true
12371239
c.logger.Warningf("could not delete secrets: %v", err)
@@ -1256,10 +1258,12 @@ func (c *Cluster) Delete() error {
12561258
}
12571259
}
12581260

1259-
if err := c.deleteService(role); err != nil {
1260-
anyErrors = true
1261-
c.logger.Warningf("could not delete %s service: %v", role, err)
1262-
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Delete", "could not delete %s service: %v", role, err)
1261+
if !isRestoreInPlace {
1262+
if err := c.deleteService(role); err != nil {
1263+
anyErrors = true
1264+
c.logger.Warningf("could not delete %s service: %v", role, err)
1265+
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Delete", "could not delete %s service: %v", role, err)
1266+
}
12631267
}
12641268
}
12651269

pkg/controller/postgresql.go

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ import (
1414

1515
v1 "k8s.io/api/core/v1"
1616
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
17+
"k8s.io/apimachinery/pkg/api/errors"
1718
"k8s.io/apimachinery/pkg/types"
19+
"k8s.io/apimachinery/pkg/util/wait"
1820
"k8s.io/client-go/tools/cache"
1921

2022
acidv1 "github.com/zalando/postgres-operator/pkg/apis/acid.zalan.do/v1"
@@ -539,6 +541,13 @@ func (c *Controller) postgresqlUpdate(prev, cur interface{}) {
539541
pgOld := c.postgresqlCheck(prev)
540542
pgNew := c.postgresqlCheck(cur)
541543
if pgOld != nil && pgNew != nil {
544+
545+
if pgNew.Annotations["postgres-operator.zalando.org/action"] == "restore-in-place" {
546+
c.logger.Debugf("restore-in-place: postgresqlUpdate called for cluster %q", pgNew.Name)
547+
c.handlerRestoreInPlace(pgOld, pgNew)
548+
return
549+
}
550+
542551
// Avoid the inifinite recursion for status updates
543552
if reflect.DeepEqual(pgOld.Spec, pgNew.Spec) {
544553
if reflect.DeepEqual(pgNew.Annotations, pgOld.Annotations) {
@@ -568,6 +577,146 @@ func (c *Controller) postgresqlCheck(obj interface{}) *acidv1.Postgresql {
568577
return pg
569578
}
570579

580+
// validateRestoreInPlace checks if the restore parameters are valid
581+
func (c *Controller) validateRestoreInPlace(pgOld, pgNew *acidv1.Postgresql) error {
582+
c.logger.Debugf("restore-in-place: validating restore parameters for cluster %q", pgNew.Name)
583+
584+
if pgNew.Spec.Clone == nil {
585+
return fmt.Errorf("'clone' section is missing in the manifest")
586+
}
587+
588+
// Use ClusterName from CloneDescription
589+
if pgNew.Spec.Clone.ClusterName != pgOld.Name {
590+
return fmt.Errorf("clone cluster name %q does not match the current cluster name %q", pgNew.Spec.Clone.ClusterName, pgOld.Name)
591+
}
592+
593+
// Use EndTimestamp from CloneDescription
594+
cloneTimestamp, err := time.Parse(time.RFC3339, pgNew.Spec.Clone.EndTimestamp)
595+
if err != nil {
596+
return fmt.Errorf("could not parse clone timestamp %q: %v", pgNew.Spec.Clone.EndTimestamp, err)
597+
}
598+
599+
if cloneTimestamp.After(time.Now()) {
600+
return fmt.Errorf("clone timestamp %q is in the future", pgNew.Spec.Clone.EndTimestamp)
601+
}
602+
603+
c.logger.Debugf("restore-in-place: validation successful")
604+
return nil
605+
}
606+
607+
// waitForOldResourcesTermination waits until the postgresql CR and its StatefulSet are terminated
608+
func (c *Controller) waitForOldResourcesTermination(pgOld *acidv1.Postgresql, statefulSetName string) error {
609+
c.logger.Debugf("restore-in-place: Waiting for old CR %q and StatefulSet %q to be fully terminated", pgOld.Name, statefulSetName)
610+
611+
err := wait.PollUntilContextTimeout(context.TODO(), 2*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) {
612+
// Check for CR
613+
_, crErr := c.KubeClient.AcidV1ClientSet.AcidV1().Postgresqls(pgOld.Namespace).Get(ctx, pgOld.Name, metav1.GetOptions{})
614+
crGone := errors.IsNotFound(crErr)
615+
if crErr != nil && !crGone {
616+
c.logger.Errorf("restore-in-place: Error while waiting for CR deletion: %v", crErr)
617+
return false, crErr // A real error occurred
618+
}
619+
620+
// Check for StatefulSet
621+
_, stsErr := c.KubeClient.StatefulSets(pgOld.Namespace).Get(ctx, statefulSetName, metav1.GetOptions{})
622+
stsGone := errors.IsNotFound(stsErr)
623+
if stsErr != nil && !stsGone {
624+
c.logger.Errorf("restore-in-place: Error while waiting for StatefulSet deletion: %v", stsErr)
625+
return false, stsErr // A real error occurred
626+
}
627+
628+
if crGone && stsGone {
629+
c.logger.Debugf("restore-in-place: Both old CR and StatefulSet are fully terminated.")
630+
return true, nil
631+
}
632+
633+
if !crGone {
634+
c.logger.Infof("restore-in-place: still waiting for postgresql CR %q to be deleted", pgOld.Name)
635+
}
636+
if !stsGone {
637+
c.logger.Infof("restore-in-place: still waiting for StatefulSet %q to be deleted", statefulSetName)
638+
}
639+
640+
return false, nil // Not done yet, continue polling.
641+
})
642+
643+
if err != nil {
644+
return fmt.Errorf("error while waiting for old resources to be deleted: %v", err)
645+
}
646+
647+
c.logger.Debugf("restore-in-place: Finished waiting for old resource deletion.")
648+
return nil
649+
}
650+
651+
// handlerRestoreInPlace is to handle the resotre in place, it does few operatons
652+
// 1. Verifies the parameters required for restoring in place
653+
// 2. Removes the old CR if it exists, wait for it, if not present check the err that it is a k8sNotfound error and continue
654+
// 3. Wait for the successful removal of statefulsets, if not present check the err that it is a k8sNotfound error and continue
655+
// 4. Create a new CR with the latest details, while keeping few metadata about restore
656+
func (c *Controller) handlerRestoreInPlace(pgOld, pgNew *acidv1.Postgresql) {
657+
c.logger.Infof("restore-in-place: starting restore-in-place for cluster %q", pgNew.Name)
658+
659+
if err := c.validateRestoreInPlace(pgOld, pgNew); err != nil {
660+
c.logger.Errorf("restore-in-place: validation failed for cluster %q: %v", pgNew.Name, err)
661+
return
662+
}
663+
664+
newPgSpec := pgNew.DeepCopy()
665+
delete(newPgSpec.Annotations, "postgres-operator.zalando.org/action")
666+
newPgSpec.ResourceVersion = ""
667+
newPgSpec.UID = ""
668+
c.logger.Debugf("restore-in-place: newPgSpec after removing annotation: %+v", newPgSpec)
669+
670+
statefulSetName := pgOld.Name // Capture StatefulSet name, it's the same as the cluster name
671+
672+
// Initiate CR deletion first, as requested
673+
c.logger.Debugf("restore-in-place: Attempting direct API deletion of postgresql CR %q", pgOld.Name)
674+
err := c.KubeClient.AcidV1ClientSet.AcidV1().Postgresqls(pgOld.Namespace).Delete(context.TODO(), pgOld.Name, metav1.DeleteOptions{})
675+
if err != nil && !errors.IsNotFound(err) {
676+
c.logger.Errorf("restore-in-place: could not delete postgresql CR via API: %v", err)
677+
return // Stop if there's a critical error deleting the CR
678+
}
679+
c.logger.Debugf("restore-in-place: Direct API deletion of postgresql CR for %q initiated (or CR was already not found).", pgOld.Name)
680+
681+
// Then, initiate cluster sub-resource deletion if the cluster object is in memory
682+
clusterName := util.NameFromMeta(pgOld.ObjectMeta)
683+
c.clustersMu.RLock()
684+
cl, clusterFound := c.clusters[clusterName]
685+
c.clustersMu.RUnlock()
686+
687+
if clusterFound {
688+
c.logger.Debugf("restore-in-place: Cluster object found in memory. Calling cluster.Delete() for %q", clusterName)
689+
if cl.Annotations == nil {
690+
cl.Annotations = make(map[string]string)
691+
}
692+
cl.Annotations["postgres-operator.zalando.org/action"] = "restore-in-place" // User requested to keep this
693+
if err := cl.Delete(); err != nil {
694+
// Log error but continue to ensure we wait for termination
695+
c.logger.Errorf("restore-in-place: error during cluster.Delete() for %q: %v. Proceeding to wait for termination.", clusterName, err)
696+
}
697+
c.logger.Debugf("restore-in-place: cluster.Delete() returned for %q", clusterName)
698+
} else {
699+
c.logger.Warningf("restore-in-place: cluster %q not found in controller's map. Relying on CR deletion to trigger cleanup.", clusterName)
700+
}
701+
702+
if err := c.waitForOldResourcesTermination(pgOld, statefulSetName); err != nil {
703+
c.logger.Errorf("restore-in-place: %v", err)
704+
return
705+
}
706+
707+
// Create a new CR with the latest details
708+
c.logger.Debugf("restore-in-place: Creating new postgresql CR %q", newPgSpec.Name)
709+
_, err = c.KubeClient.AcidV1ClientSet.AcidV1().Postgresqls(newPgSpec.Namespace).Create(context.TODO(), newPgSpec, metav1.CreateOptions{})
710+
if err != nil {
711+
c.logger.Errorf("restore-in-place: could not create postgresql CR for restore-in-place: %v", err)
712+
// If the new CR cannot be created, the user needs to intervene.
713+
return
714+
}
715+
c.logger.Debugf("restore-in-place: New postgresql CR %q created", newPgSpec.Name)
716+
717+
c.logger.Infof("restore-in-place: for cluster %q triggered successfully", pgNew.Name)
718+
}
719+
571720
/*
572721
Ensures the pod service account and role bindings exists in a namespace
573722
before a PG cluster is created there so that a user does not have to deploy

0 commit comments

Comments
 (0)