@@ -3,8 +3,10 @@ package cluster
33import (
44 "context"
55 "encoding/json"
6+ "errors"
67 "fmt"
78 "strings"
9+ "time"
810
911 "github.com/Masterminds/semver"
1012 "github.com/cybertec-postgresql/cybertec-pg-operator/pkg/spec"
@@ -29,6 +31,8 @@ const (
2931 majorVersionUpgradeFailureAnnotation = "last-major-upgrade-failure"
3032)
3133
// errUpgradePrepNotReady is the sentinel error that majorVersionUpgrade wraps
// (via %w) when a replica is not streaming or its replication lag is too high.
// executeMajorVersionUpgrade matches it with errors.Is to decide whether the
// upgrade attempt should be retried.
34+ var errUpgradePrepNotReady = errors .New ("cluster not ready for upgrade" )
35+
3236// IsBiggerPostgresVersion Compare two Postgres version numbers
3337func IsBiggerPostgresVersion (old string , new string ) bool {
3438 oldN := VersionMap [old ]
@@ -232,12 +236,14 @@ func (c *Cluster) majorVersionUpgrade() error {
232236 continue
233237 }
234238 if checkStreaming && member .State != "streaming" {
235- c .logger .Infof ("skipping major version upgrade, replica %s is not streaming from primary" , member .Name )
236- return nil
239+ // c.logger.Infof("skipping major version upgrade, replica %s is not streaming from primary", member.Name)
240+ // return nil
241+ return fmt .Errorf ("%w: replica %s is not streaming (state: %s)" , errUpgradePrepNotReady , member .Name , member .State )
237242 }
238243 if member .Lag > 16 * 1024 * 1024 {
239- c .logger .Infof ("skipping major version upgrade, replication lag on member %s is too high" , member .Name )
240- return nil
244+ // c.logger.Infof("skipping major version upgrade, replication lag on member %s is too high", member.Name)
245+ // return nil
246+ return fmt .Errorf ("%w: replication lag on member %s is too high (%d bytes)" , errUpgradePrepNotReady , member .Name , member .Lag )
241247 }
242248 }
243249
@@ -246,11 +252,10 @@ func (c *Cluster) majorVersionUpgrade() error {
246252 if allRunning {
247253 c .logger .Infof ("healthy cluster ready to upgrade, current: %d desired: %d" , c .currentMajorVersion , desiredVersion )
248254 if c .currentMajorVersion < desiredVersion {
249- defer func () error {
250- if err = c .criticalOperationLabel (pods , nil ); err != nil {
251- return fmt . Errorf ("failed to remove critical-operation label: %s " , err )
255+ defer func () {
256+ if err : = c .criticalOperationLabel (pods , nil ); err != nil {
257+ c . logger . Errorf ("failed to remove critical-operation label: %v " , err )
252258 }
253- return nil
254259 }()
255260 val := "true"
256261 if err = c .criticalOperationLabel (pods , & val ); err != nil {
@@ -260,37 +265,72 @@ func (c *Cluster) majorVersionUpgrade() error {
260265 podName := & spec.NamespacedName {Namespace : masterPod .Namespace , Name : masterPod .Name }
261266 c .logger .Infof ("triggering major version upgrade on pod %s of %d pods" , masterPod .Name , numberOfPods )
262267 c .eventRecorder .Eventf (c .GetReference (), v1 .EventTypeNormal , "Major Version Upgrade" , "starting major version upgrade on pod %s of %d pods" , masterPod .Name , numberOfPods )
263- upgradeCommand := fmt .Sprintf ("set -o pipefail && /usr/bin/python3 /scripts/inplace_upgrade.py %d 2>&1 | tee last_upgrade.log" , numberOfPods )
264-
265- c .logger .Debug ("checking if the spilo image runs with root or non-root (check for user id=0)" )
268+ upgradeCommand := fmt .Sprintf ("/usr/local/bin/python3 /scripts/inplace_upgrade.py %d 2>&1" , numberOfPods )
269+ c .logger .Debug ("checking if the container runs with root or non-root (check for user id=0)" )
266270 resultIdCheck , errIdCheck := c .ExecCommand (podName , "/bin/bash" , "-c" , "/usr/bin/id -u" )
267271 if errIdCheck != nil {
268272 c .eventRecorder .Eventf (c .GetReference (), v1 .EventTypeWarning , "Major Version Upgrade" , "checking user id to run upgrade from %d to %d FAILED: %v" , c .currentMajorVersion , desiredVersion , errIdCheck )
269273 }
270274
271275 resultIdCheck = strings .TrimSuffix (resultIdCheck , "\n " )
272- var result , scriptErrMsg string
276+ var result string
277+
273278 if resultIdCheck != "0" {
274279 c .logger .Infof ("user id was identified as: %s, hence default user is non-root already" , resultIdCheck )
275280 result , err = c .ExecCommand (podName , "/bin/bash" , "-c" , upgradeCommand )
276- scriptErrMsg , _ = c .ExecCommand (podName , "/bin/bash" , "-c" , "tail -n 1 last_upgrade.log" )
277281 } else {
278282 c .logger .Infof ("user id was identified as: %s, using su to reach the postgres user" , resultIdCheck )
279283 result , err = c .ExecCommand (podName , "/bin/su" , "postgres" , "-c" , upgradeCommand )
280- scriptErrMsg , _ = c .ExecCommand (podName , "/bin/bash" , "-c" , "tail -n 1 last_upgrade.log" )
281284 }
285+
282286 if err != nil {
283287 isUpgradeSuccess = false
284288 c .annotatePostgresResource (isUpgradeSuccess )
285- c .eventRecorder .Eventf (c .GetReference (), v1 .EventTypeWarning , "Major Version Upgrade" , "upgrade from %d to %d FAILED: %v" , c .currentMajorVersion , desiredVersion , scriptErrMsg )
286- return fmt .Errorf ("%s" , scriptErrMsg )
289+
290+ finalErrorMsg := strings .TrimSpace (result )
291+ if finalErrorMsg == "" {
292+ finalErrorMsg = err .Error ()
293+ }
294+
295+ lines := strings .Split (finalErrorMsg , "\n " )
296+ if len (lines ) > 5 {
297+ finalErrorMsg = strings .Join (lines [len (lines )- 5 :], " | " )
298+ }
299+
300+ c .logger .Errorf ("Major upgrade failed: %v" , err )
301+ c .eventRecorder .Eventf (c .GetReference (), v1 .EventTypeWarning , "Major Version Upgrade" , "upgrade from %d to %d FAILED: %s" , c .currentMajorVersion , desiredVersion , finalErrorMsg )
302+
303+ return fmt .Errorf ("upgrade script failed: %s" , finalErrorMsg )
287304 }
288305
289306 c .annotatePostgresResource (isUpgradeSuccess )
290307 c .logger .Infof ("upgrade action triggered and command completed: %s" , result [:100 ])
291- c .eventRecorder .Eventf (c .GetReference (), v1 .EventTypeNormal , "Major Version Upgrade" , "upgrade from %d to %d finished " , c .currentMajorVersion , desiredVersion )
308+ c .eventRecorder .Eventf (c .GetReference (), v1 .EventTypeNormal , "Major Version Upgrade" , "major version upgrade from version %d to version %d was successfully completed. " , c .currentMajorVersion , desiredVersion )
292309 }
293310 }
294311
295312 return nil
296313}
314+
315+ func (c * Cluster ) executeMajorVersionUpgrade () error {
316+ maxRetries := 6
317+ var lastErr error
318+
319+ for i := 0 ; i < maxRetries ; i ++ {
320+ lastErr = c .majorVersionUpgrade ()
321+ if lastErr == nil {
322+ return nil
323+ }
324+
325+ if errors .Is (lastErr , errUpgradePrepNotReady ) {
326+ c .logger .Warnf ("Major version upgrade deferred (attempt %d/%d): %v. Retrying in 15s..." , i + 1 , maxRetries , lastErr )
327+
328+ if i < maxRetries - 1 {
329+ time .Sleep (15 * time .Second )
330+ continue
331+ }
332+ }
333+ return lastErr
334+ }
335+ return lastErr
336+ }
0 commit comments