Skip to content

Commit 19ca281

Browse files
committed
SCF-830: Added state management for cluster hibernation
1 parent d16504e commit 19ca281

10 files changed

Lines changed: 236 additions & 2 deletions

File tree

manifests/postgresql.crd.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3246,6 +3246,13 @@ spec:
32463246
- name
32473247
type: object
32483248
type: array
3249+
lifecycle:
3250+
description: LifecycleSpec describes the lifecycle state of a Postgres
3251+
cluster.
3252+
properties:
3253+
phase:
3254+
type: string
3255+
type: object
32493256
logicalBackupRetention:
32503257
type: string
32513258
logicalBackupSchedule:
@@ -4197,6 +4204,9 @@ spec:
41974204
properties:
41984205
PostgresClusterStatus:
41994206
type: string
4207+
previousNumberOfInstances:
4208+
format: int32
4209+
type: integer
42004210
required:
42014211
- PostgresClusterStatus
42024212
type: object

pkg/apis/acid.zalan.do/v1/const.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ const (
99
ClusterStatusSyncFailed = "SyncFailed"
1010
ClusterStatusAddFailed = "CreateFailed"
1111
ClusterStatusRunning = "Running"
12+
ClusterStatusStopping = "Stopping"
13+
ClusterStatusStopped = "Stopped"
1214
ClusterStatusInvalid = "Invalid"
1315
)
1416

pkg/apis/acid.zalan.do/v1/postgresql.crd.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3246,6 +3246,13 @@ spec:
32463246
- name
32473247
type: object
32483248
type: array
3249+
lifecycle:
3250+
description: LifecycleSpec describes the lifecycle state of a Postgres
3251+
cluster.
3252+
properties:
3253+
phase:
3254+
type: string
3255+
type: object
32493256
logicalBackupRetention:
32503257
type: string
32513258
logicalBackupSchedule:
@@ -4197,6 +4204,9 @@ spec:
41974204
properties:
41984205
PostgresClusterStatus:
41994206
type: string
4207+
previousNumberOfInstances:
4208+
format: int32
4209+
type: integer
42004210
required:
42014211
- PostgresClusterStatus
42024212
type: object

pkg/apis/acid.zalan.do/v1/postgresql_type.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ type PostgresSpec struct {
115115
TLS *TLSDescription `json:"tls,omitempty"`
116116
AdditionalVolumes []AdditionalVolume `json:"additionalVolumes,omitempty"`
117117
Streams []Stream `json:"streams,omitempty"`
118+
Lifecycle *LifecycleSpec `json:"lifecycle,omitempty"`
118119
Env []v1.EnvVar `json:"env,omitempty"`
119120

120121
// deprecated
@@ -257,6 +258,11 @@ type StandbyDescription struct {
257258
StandbyPrimarySlotName string `json:"standby_primary_slot_name,omitempty"`
258259
}
259260

261+
// LifecycleSpec describes the lifecycle state of a Postgres cluster.
262+
type LifecycleSpec struct {
263+
Phase string `json:"phase,omitempty"` // requested phase; this change only acts on "stopped" (hibernate), any other value or an absent lifecycle is handled as "not stopped"
264+
}
265+
260266
// TLSDescription specs TLS properties
261267
type TLSDescription struct {
262268
// +required
@@ -302,7 +308,8 @@ type UserFlags []string
302308

303309
// PostgresStatus contains status of the PostgreSQL cluster (running, creation failed etc.)
304310
type PostgresStatus struct {
305-
PostgresClusterStatus string `json:"PostgresClusterStatus"`
311+
PostgresClusterStatus string `json:"PostgresClusterStatus"`
312+
PreviousNumberOfInstances int32 `json:"previousNumberOfInstances,omitempty"` // instance count saved when hibernation starts and restored on wake-up; 0 means nothing is saved
306313
}
307314

308315
// ConnectionPooler Options for connection pooler

pkg/apis/acid.zalan.do/v1/util.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,16 @@ func (postgresStatus PostgresStatus) Creating() bool {
101101
return postgresStatus.PostgresClusterStatus == ClusterStatusCreating
102102
}
103103

104+
// Stopping status of cluster
105+
func (postgresStatus PostgresStatus) Stopping() bool {
106+
return postgresStatus.PostgresClusterStatus == ClusterStatusStopping
107+
}
108+
109+
// Stopped status of cluster
110+
func (postgresStatus PostgresStatus) Stopped() bool {
111+
return postgresStatus.PostgresClusterStatus == ClusterStatusStopped
112+
}
113+
104114
func (postgresStatus PostgresStatus) String() string {
105115
return postgresStatus.PostgresClusterStatus
106116
}

pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/cluster/cluster.go

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1008,13 +1008,94 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error {
10081008
c.mu.Lock()
10091009
defer c.mu.Unlock()
10101010

1011+
// Block all spec changes when cluster is stopped or stopping
1012+
if c.Status.Stopped() || c.Status.Stopping() {
1013+
lifecyclePhase := ""
1014+
if newSpec.Spec.Lifecycle != nil {
1015+
lifecyclePhase = newSpec.Spec.Lifecycle.Phase
1016+
}
1017+
// During Stopping: block ALL spec changes (no cancellation allowed)
1018+
// During Stopped: only block if keeping lifecycle.phase="stopped"
1019+
if c.Status.Stopping() {
1020+
return fmt.Errorf("cannot update cluster while it is stopping. Wait for it to fully stop first")
1021+
}
1022+
if lifecyclePhase == "stopped" {
1023+
return fmt.Errorf("cannot update cluster while stopped. Remove lifecycle.phase to wake up the cluster")
1024+
}
1025+
}
1026+
10111027
newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusUpdating
10121028

10131029
newSpec, err := c.KubeClient.SetPostgresCRDStatus(c.clusterName(), newSpec)
10141030
if err != nil {
10151031
return fmt.Errorf("could not set cluster status to updating: %w", err)
10161032
}
10171033

1034+
// Check if user is initiating hibernate (Running -> Stopping)
1035+
if c.Status.Running() && newSpec.Spec.Lifecycle != nil && newSpec.Spec.Lifecycle.Phase == "stopped" {
1036+
c.logger.Infof("[lifecycle] initiating hibernate for cluster %s: current numberOfInstances=%d", c.Name, c.Spec.NumberOfInstances)
1037+
1038+
// Store previousNumberOfInstances BEFORE setting numberOfInstances to 0
1039+
newSpec.Status.PreviousNumberOfInstances = c.Spec.NumberOfInstances
1040+
newSpec.Spec.NumberOfInstances = 0
1041+
newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusStopping
1042+
1043+
c.logger.Infof("[lifecycle] hibernate initiated: setting numberOfInstances=0, previousNumberOfInstances=%d", newSpec.Status.PreviousNumberOfInstances)
1044+
1045+
// Update spec first (Update only updates spec when CR has status subresource)
1046+
pgUpdated, err := c.KubeClient.UpdatePostgresCR(c.clusterName(), newSpec)
1047+
if err != nil {
1048+
return fmt.Errorf("could not update spec during hibernate: %w", err)
1049+
}
1050+
c.logger.Infof("[lifecycle] hibernate: spec updated successfully")
1051+
1052+
// Update status separately - we need to preserve the status values we set
1053+
// because UpdatePostgresCR returns object with status zeroed (subresource behavior)
1054+
pgUpdated.Status.PreviousNumberOfInstances = newSpec.Status.PreviousNumberOfInstances
1055+
pgUpdated.Status.PostgresClusterStatus = newSpec.Status.PostgresClusterStatus
1056+
1057+
pgUpdated, err = c.KubeClient.SetPostgresCRDStatus(c.clusterName(), pgUpdated)
1058+
if err != nil {
1059+
return fmt.Errorf("could not update status during hibernate: %w", err)
1060+
}
1061+
c.logger.Infof("[lifecycle] hibernate: status updated successfully, previousNumberOfInstances=%d", pgUpdated.Status.PreviousNumberOfInstances)
1062+
1063+
c.setSpec(pgUpdated)
1064+
return nil
1065+
}
1066+
1067+
// Check if user is waking up from stopped state (Stopped -> Running)
1068+
// This is when user clears lifecycle.phase to wake up the cluster
1069+
if c.Status.Stopped() && (newSpec.Spec.Lifecycle == nil || newSpec.Spec.Lifecycle.Phase != "stopped") {
1070+
if newSpec.Status.PreviousNumberOfInstances > 0 {
1071+
c.logger.Infof("[lifecycle] waking up cluster %s: restoring numberOfInstances=%d", c.Name, newSpec.Status.PreviousNumberOfInstances)
1072+
1073+
// Restore numberOfInstances from previousNumberOfInstances
1074+
newSpec.Spec.NumberOfInstances = newSpec.Status.PreviousNumberOfInstances
1075+
newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusUpdating
1076+
1077+
// Update spec first
1078+
pgUpdated, err := c.KubeClient.UpdatePostgresCR(c.clusterName(), newSpec)
1079+
if err != nil {
1080+
return fmt.Errorf("could not update spec during wake-up: %w", err)
1081+
}
1082+
c.logger.Infof("[lifecycle] wake-up: spec updated successfully")
1083+
1084+
// Update status separately, and clear previousNumberOfInstances after restore
1085+
pgUpdated.Status.PreviousNumberOfInstances = 0 // Clear after successful restore
1086+
pgUpdated.Status.PostgresClusterStatus = newSpec.Status.PostgresClusterStatus
1087+
1088+
pgUpdated, err = c.KubeClient.SetPostgresCRDStatus(c.clusterName(), pgUpdated)
1089+
if err != nil {
1090+
return fmt.Errorf("could not update status during wake-up: %w", err)
1091+
}
1092+
c.logger.Infof("[lifecycle] wake-up: status updated successfully, previousNumberOfInstances cleared")
1093+
1094+
c.setSpec(pgUpdated)
1095+
return nil
1096+
}
1097+
}
1098+
10181099
if !c.isInMaintenanceWindow(newSpec.Spec.MaintenanceWindows) {
10191100
// do not apply any major version related changes yet
10201101
newSpec.Spec.PostgresqlParam.PgVersion = oldSpec.Spec.PostgresqlParam.PgVersion

pkg/cluster/lifecycle.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
package cluster
2+
3+
import (
4+
acidv1 "github.com/zalando/postgres-operator/pkg/apis/acid.zalan.do/v1"
5+
)
6+
7+
// manageHibernateState manages cluster hibernate/wake-up state transitions.
8+
// Returns true if sync should continue, false if it should return early.
9+
//
10+
// This function handles the following state transitions:
11+
// - Running -> Stopping: When user sets lifecycle.phase = "stopped"
12+
// - Stopping -> Stopped: When StatefulSet replicas reach 0
13+
// - Stopped -> Updating: When user clears lifecycle.phase (wake-up)
14+
// - Updating -> Running: Normal sync continues, defer sets final status
15+
func (c *Cluster) manageHibernateState(oldSpec acidv1.Postgresql, newSpec *acidv1.Postgresql) bool {
16+
17+
// FIX B: Detect wake-up by comparing oldSpec status vs newSpec status
18+
// When Update() is called, it sets status=Updating before Sync() runs.
19+
// So we need to check if oldSpec.Status was Stopped and newSpec is Updating
20+
// with lifecycle cleared to properly detect wake-up.
21+
isWakingUp := oldSpec.Status.Stopped() &&
22+
newSpec.Status.PostgresClusterStatus == acidv1.ClusterStatusUpdating &&
23+
(newSpec.Spec.Lifecycle == nil || newSpec.Spec.Lifecycle.Phase != "stopped")
24+
25+
// FIX C: Additional wake-up detection with simpler condition
26+
// If lifecycle was cleared and we have previousNumberOfInstances and numberOfInstances is 0
27+
isWakingUpSimple := newSpec.Spec.Lifecycle == nil || newSpec.Spec.Lifecycle.Phase != "stopped"
28+
hasPreviousInstances := newSpec.Status.PreviousNumberOfInstances > 0
29+
needsRestore := newSpec.Spec.NumberOfInstances == 0
30+
31+
isWakingUp = isWakingUp || (isWakingUpSimple && hasPreviousInstances && needsRestore)
32+
33+
// === INITIATE HIBERNATE: Running -> Stopping ===
34+
// Only initiate if not already stopping or stopped, and lifecycle.phase = "stopped"
35+
if newSpec.Spec.Lifecycle != nil &&
36+
newSpec.Spec.Lifecycle.Phase == "stopped" &&
37+
!newSpec.Status.Stopping() &&
38+
!newSpec.Status.Stopped() {
39+
40+
newSpec.Status.PreviousNumberOfInstances = newSpec.Spec.NumberOfInstances
41+
newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusStopping
42+
newSpec.Spec.NumberOfInstances = 0
43+
c.logger.Infof("[lifecycle] cluster is going to hibernate, stored previous number of instances: %d",
44+
newSpec.Status.PreviousNumberOfInstances)
45+
return true
46+
}
47+
48+
// === STOPPING -> STOPPED: Check actual StatefulSet replicas ===
49+
// Only transition to Stopped when StatefulSet replicas have actually reached 0
50+
if newSpec.Status.Stopping() {
51+
if c.Statefulset != nil && *c.Statefulset.Spec.Replicas == 0 {
52+
newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusStopped
53+
c.logger.Infof("[lifecycle] cluster has stopped, all pods are terminated")
54+
}
55+
return true
56+
}
57+
58+
// === WAKE-UP: Stopped/Updating -> Running ===
59+
// Restore numberOfInstances from previousNumberOfInstances when waking up
60+
if newSpec.Status.Stopped() || isWakingUp {
61+
// Check if lifecycle.phase was cleared (user wants to wake up)
62+
if isWakingUp || newSpec.Spec.Lifecycle == nil || newSpec.Spec.Lifecycle.Phase != "stopped" {
63+
if newSpec.Status.PreviousNumberOfInstances > 0 {
64+
newSpec.Spec.NumberOfInstances = newSpec.Status.PreviousNumberOfInstances
65+
c.logger.Infof("[lifecycle] cluster is waking up, restoring number of instances: %d",
66+
newSpec.Status.PreviousNumberOfInstances)
67+
} else {
68+
c.logger.Warningf("[lifecycle] cluster is waking up but previousNumberOfInstances is 0, cannot restore")
69+
}
70+
newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusUpdating
71+
return true
72+
}
73+
// Still stopped and lifecycle.phase = "stopped", skip further sync
74+
return false
75+
}
76+
77+
return true
78+
}

pkg/cluster/sync.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ func (c *Cluster) Sync(newSpec *acidv1.Postgresql) error {
4747
if err != nil {
4848
c.logger.Warningf("error while syncing cluster state: %v", err)
4949
newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusSyncFailed
50-
} else if !c.Status.Running() {
50+
} else if !c.Status.Running() && !c.Status.Stopping() && !c.Status.Stopped() {
5151
newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusRunning
5252
}
5353

@@ -65,6 +65,11 @@ func (c *Cluster) Sync(newSpec *acidv1.Postgresql) error {
6565
c.logger.Debugf("could not sync finalizers: %v", err)
6666
}
6767

68+
// Handle lifecycle hibernate/wake-up state transitions
69+
if !c.manageHibernateState(oldSpec, newSpec) {
70+
return nil
71+
}
72+
6873
if err = c.initUsers(); err != nil {
6974
err = fmt.Errorf("could not init users: %v", err)
7075
return err

pkg/util/k8sutil/k8sutil.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,16 @@ func (client *KubernetesClient) SetPostgresCRDStatus(clusterName spec.Namespaced
200200
return pg, nil
201201
}
202202

203+
// UpdatePostgresCR of Postgres cluster (updates full resource including spec)
204+
func (client *KubernetesClient) UpdatePostgresCR(clusterName spec.NamespacedName, pg *apiacidv1.Postgresql) (*apiacidv1.Postgresql, error) {
205+
pg, err := client.PostgresqlsGetter.Postgresqls(clusterName.Namespace).Update(context.TODO(), pg, metav1.UpdateOptions{})
206+
if err != nil {
207+
return pg, fmt.Errorf("could not update PostgresCR: %v", err)
208+
}
209+
210+
return pg, nil
211+
}
212+
203213
// SetFinalizer of Postgres cluster
204214
func (client *KubernetesClient) SetFinalizer(clusterName spec.NamespacedName, pg *apiacidv1.Postgresql, finalizers []string) (*apiacidv1.Postgresql, error) {
205215
var (

0 commit comments

Comments
 (0)