Skip to content

Commit 40b5674

Browse files
authored
Merge pull request #138 from cybertec-postgresql/optimizeMajorUpgradeTrigger
Optimize major upgrade trigger
2 parents aa3e738 + 9d7f2fa commit 40b5674

5 files changed

Lines changed: 83 additions & 25 deletions

File tree

pkg/cluster/cluster.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1251,13 +1251,18 @@ func (c *Cluster) Update(oldSpec, newSpec *cpov1.Postgresql) error {
12511251
}
12521252

12531253
if !updateFailed {
1254-
// Major version upgrade must only fire after success of earlier operations and should stay last
1255-
if err := c.majorVersionUpgrade(); err != nil {
1256-
c.logger.Errorf("major version upgrade failed: %v", err)
1254+
if upgradeErr := c.executeMajorVersionUpgrade(); upgradeErr != nil {
1255+
c.logger.Errorf("major version upgrade failed: %v", upgradeErr)
12571256
updateFailed = true
12581257
}
12591258
}
12601259

1260+
if updateFailed {
1261+
c.logger.Errorf("Update for cluster %s/%s finished with errors..", c.Namespace, c.Name)
1262+
} else {
1263+
c.logger.Infof("Update for cluster %s/%s completed successfully.", c.Namespace, c.Name)
1264+
}
1265+
12611266
return nil
12621267
}
12631268

pkg/cluster/majorversionupgrade.go

Lines changed: 57 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ package cluster
33
import (
44
"context"
55
"encoding/json"
6+
"errors"
67
"fmt"
78
"strings"
9+
"time"
810

911
"github.com/Masterminds/semver"
1012
"github.com/cybertec-postgresql/cybertec-pg-operator/pkg/spec"
@@ -29,6 +31,8 @@ const (
2931
majorVersionUpgradeFailureAnnotation = "last-major-upgrade-failure"
3032
)
3133

34+
var errUpgradePrepNotReady = errors.New("cluster not ready for upgrade")
35+
3236
// IsBiggerPostgresVersion Compare two Postgres version numbers
3337
func IsBiggerPostgresVersion(old string, new string) bool {
3438
oldN := VersionMap[old]
@@ -232,12 +236,14 @@ func (c *Cluster) majorVersionUpgrade() error {
232236
continue
233237
}
234238
if checkStreaming && member.State != "streaming" {
235-
c.logger.Infof("skipping major version upgrade, replica %s is not streaming from primary", member.Name)
236-
return nil
239+
// c.logger.Infof("skipping major version upgrade, replica %s is not streaming from primary", member.Name)
240+
// return nil
241+
return fmt.Errorf("%w: replica %s is not streaming (state: %s)", errUpgradePrepNotReady, member.Name, member.State)
237242
}
238243
if member.Lag > 16*1024*1024 {
239-
c.logger.Infof("skipping major version upgrade, replication lag on member %s is too high", member.Name)
240-
return nil
244+
// c.logger.Infof("skipping major version upgrade, replication lag on member %s is too high", member.Name)
245+
// return nil
246+
return fmt.Errorf("%w: replication lag on member %s is too high (%d bytes)", errUpgradePrepNotReady, member.Name, member.Lag)
241247
}
242248
}
243249

@@ -246,11 +252,10 @@ func (c *Cluster) majorVersionUpgrade() error {
246252
if allRunning {
247253
c.logger.Infof("healthy cluster ready to upgrade, current: %d desired: %d", c.currentMajorVersion, desiredVersion)
248254
if c.currentMajorVersion < desiredVersion {
249-
defer func() error {
250-
if err = c.criticalOperationLabel(pods, nil); err != nil {
251-
return fmt.Errorf("failed to remove critical-operation label: %s", err)
255+
defer func() {
256+
if err := c.criticalOperationLabel(pods, nil); err != nil {
257+
c.logger.Errorf("failed to remove critical-operation label: %v", err)
252258
}
253-
return nil
254259
}()
255260
val := "true"
256261
if err = c.criticalOperationLabel(pods, &val); err != nil {
@@ -260,37 +265,72 @@ func (c *Cluster) majorVersionUpgrade() error {
260265
podName := &spec.NamespacedName{Namespace: masterPod.Namespace, Name: masterPod.Name}
261266
c.logger.Infof("triggering major version upgrade on pod %s of %d pods", masterPod.Name, numberOfPods)
262267
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "starting major version upgrade on pod %s of %d pods", masterPod.Name, numberOfPods)
263-
upgradeCommand := fmt.Sprintf("set -o pipefail && /usr/bin/python3 /scripts/inplace_upgrade.py %d 2>&1 | tee last_upgrade.log", numberOfPods)
264-
265-
c.logger.Debug("checking if the spilo image runs with root or non-root (check for user id=0)")
268+
upgradeCommand := fmt.Sprintf("/usr/local/bin/python3 /scripts/inplace_upgrade.py %d 2>&1", numberOfPods)
269+
c.logger.Debug("checking if the container runs with root or non-root (check for user id=0)")
266270
resultIdCheck, errIdCheck := c.ExecCommand(podName, "/bin/bash", "-c", "/usr/bin/id -u")
267271
if errIdCheck != nil {
268272
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "checking user id to run upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, errIdCheck)
269273
}
270274

271275
resultIdCheck = strings.TrimSuffix(resultIdCheck, "\n")
272-
var result, scriptErrMsg string
276+
var result string
277+
273278
if resultIdCheck != "0" {
274279
c.logger.Infof("user id was identified as: %s, hence default user is non-root already", resultIdCheck)
275280
result, err = c.ExecCommand(podName, "/bin/bash", "-c", upgradeCommand)
276-
scriptErrMsg, _ = c.ExecCommand(podName, "/bin/bash", "-c", "tail -n 1 last_upgrade.log")
277281
} else {
278282
c.logger.Infof("user id was identified as: %s, using su to reach the postgres user", resultIdCheck)
279283
result, err = c.ExecCommand(podName, "/bin/su", "postgres", "-c", upgradeCommand)
280-
scriptErrMsg, _ = c.ExecCommand(podName, "/bin/bash", "-c", "tail -n 1 last_upgrade.log")
281284
}
285+
282286
if err != nil {
283287
isUpgradeSuccess = false
284288
c.annotatePostgresResource(isUpgradeSuccess)
285-
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, scriptErrMsg)
286-
return fmt.Errorf("%s", scriptErrMsg)
289+
290+
finalErrorMsg := strings.TrimSpace(result)
291+
if finalErrorMsg == "" {
292+
finalErrorMsg = err.Error()
293+
}
294+
295+
lines := strings.Split(finalErrorMsg, "\n")
296+
if len(lines) > 5 {
297+
finalErrorMsg = strings.Join(lines[len(lines)-5:], " | ")
298+
}
299+
300+
c.logger.Errorf("Major upgrade failed: %v", err)
301+
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "upgrade from %d to %d FAILED: %s", c.currentMajorVersion, desiredVersion, finalErrorMsg)
302+
303+
return fmt.Errorf("upgrade script failed: %s", finalErrorMsg)
287304
}
288305

289306
c.annotatePostgresResource(isUpgradeSuccess)
290307
c.logger.Infof("upgrade action triggered and command completed: %s", result[:100])
291-
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "upgrade from %d to %d finished", c.currentMajorVersion, desiredVersion)
308+
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "major version upgrade from version %d to version %d was successfully completed.", c.currentMajorVersion, desiredVersion)
292309
}
293310
}
294311

295312
return nil
296313
}
314+
315+
func (c *Cluster) executeMajorVersionUpgrade() error {
316+
maxRetries := 6
317+
var lastErr error
318+
319+
for i := 0; i < maxRetries; i++ {
320+
lastErr = c.majorVersionUpgrade()
321+
if lastErr == nil {
322+
return nil
323+
}
324+
325+
if errors.Is(lastErr, errUpgradePrepNotReady) {
326+
c.logger.Warnf("Major version upgrade deferred (attempt %d/%d): %v. Retrying in 15s...", i+1, maxRetries, lastErr)
327+
328+
if i < maxRetries-1 {
329+
time.Sleep(15 * time.Second)
330+
continue
331+
}
332+
}
333+
return lastErr
334+
}
335+
return lastErr
336+
}

pkg/cluster/sync.go

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ func generateSerialNumber() (*big.Int, error) {
172172
// Unlike the update, sync does not error out if some objects do not exist and takes care of creating them.
173173
func (c *Cluster) Sync(newSpec *cpov1.Postgresql) error {
174174
var err error
175+
syncFailed := false
175176
c.mu.Lock()
176177
defer c.mu.Unlock()
177178

@@ -326,9 +327,21 @@ func (c *Cluster) Sync(newSpec *cpov1.Postgresql) error {
326327
}
327328
}
328329

329-
// Major version upgrade must only run after success of all earlier operations, must remain last item in sync
330-
if err := c.majorVersionUpgrade(); err != nil {
331-
c.logger.Errorf("major version upgrade failed: %v", err)
330+
if err != nil {
331+
syncFailed = true
332+
}
333+
if !syncFailed {
334+
err = c.executeMajorVersionUpgrade()
335+
if err != nil {
336+
c.logger.Errorf("major version upgrade failed after retries: %v", err)
337+
syncFailed = true
338+
}
339+
}
340+
341+
if syncFailed {
342+
c.logger.Errorf("Update for cluster %s/%s finished with errors..", c.Namespace, c.Name)
343+
} else {
344+
c.logger.Infof("Update for cluster %s/%s completed successfully.", c.Namespace, c.Name)
332345
}
333346

334347
return err

ui/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@ ARG VERSION=dev
3838
RUN sed -i "s/__version__ = .*/__version__ = '${VERSION}'/" /operator_ui/__init__.py
3939

4040
WORKDIR /
41-
CMD ["/usr/bin/python3", "-m", "operator_ui"]
41+
CMD ["/usr/local/bin/python3", "-m", "operator_ui"]

ui/start_server.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
#!/bin/bash
2-
/usr/bin/python3 -m operator_ui
2+
/usr/local/bin/python3 -m operator_ui

0 commit comments

Comments
 (0)