Skip to content

Commit 2a8032f

Browse files
K8SPG-374: detect replication lag on standby (#1407)
Signed-off-by: Mayank Shah <mayank.shah@percona.com>
1 parent: dae160f · commit: 2a8032f

52 files changed

Lines changed: 2512 additions & 13 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

build/crd/percona/generated/pgv2.percona.com_perconapgclusters.yaml

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21028,6 +21028,17 @@ spec:
2102821028
description: Network address of the PostgreSQL server to follow
2102921029
via streaming replication.
2103021030
type: string
21031+
maxAcceptableLag:
21032+
anyOf:
21033+
- type: integer
21034+
- type: string
21035+
description: |-
21036+
MaxAcceptableLag is the maximum WAL lag allowed for the standby cluster, measured in bytes of WAL data.
21037+
This represents the maximum amount of WAL data that the standby can be behind the primary.
21038+
If the lag exceeds this value, the standby cluster is marked as unready.
21039+
If unset, lag is not checked.
21040+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
21041+
x-kubernetes-int-or-string: true
2103121042
port:
2103221043
description: Network port of the PostgreSQL server to follow via
2103321044
streaming replication.
@@ -21461,6 +21472,15 @@ spec:
2146121472
version:
2146221473
type: integer
2146321474
type: object
21475+
standby:
21476+
properties:
21477+
lagBytes:
21478+
format: int64
21479+
type: integer
21480+
lagLastComputedAt:
21481+
format: date-time
21482+
type: string
21483+
type: object
2146421484
state:
2146521485
type: string
2146621486
type: object

build/postgres-operator/postgres-readiness-check.sh

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,12 @@
33
PATRONI_PORT=8008
44
PATRONI_HOST=localhost
55

6-
response=$(curl -s -o /dev/null -w "%{http_code}" -k "https://${PATRONI_HOST}:${PATRONI_PORT}/readiness")
6+
if [[ -f /pgdata/replication-lag-detected ]]; then
7+
echo "Replication lag detected, pod is not ready"
8+
exit 1
9+
fi
710

11+
response=$(curl -s -o /dev/null -w "%{http_code}" -k "https://${PATRONI_HOST}:${PATRONI_PORT}/readiness")
812
if [[ $response -eq 200 ]]; then
913
exit 0
1014
fi

cmd/postgres-operator/main.go

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ import (
4141
"github.com/percona/percona-postgresql-operator/v2/percona/controller/pgcluster"
4242
"github.com/percona/percona-postgresql-operator/v2/percona/controller/pgrestore"
4343
perconaPGUpgrade "github.com/percona/percona-postgresql-operator/v2/percona/controller/pgupgrade"
44+
"github.com/percona/percona-postgresql-operator/v2/percona/k8s"
4445
perconaRuntime "github.com/percona/percona-postgresql-operator/v2/percona/runtime"
4546
"github.com/percona/percona-postgresql-operator/v2/percona/utils/registry"
4647
v2 "github.com/percona/percona-postgresql-operator/v2/pkg/apis/pgv2.percona.com/v2"
@@ -189,7 +190,14 @@ func addControllersToManager(ctx context.Context, mgr manager.Manager) error {
189190
StopExternalWatchers: stopChan,
190191
Watchers: registry.New(),
191192
}
192-
if err := pc.SetupWithManager(mgr); err != nil {
193+
194+
if namespaces, err := k8s.GetWatchNamespace(); err != nil {
195+
return errors.Wrap(err, "check if watching multi namespace")
196+
} else {
197+
pc.WatchNamespace = strings.Split(namespaces, ",")
198+
}
199+
200+
if err := pc.SetupWithManager(ctx, mgr); err != nil {
193201
return err
194202
}
195203

config/crd/bases/pgv2.percona.com_perconapgclusters.yaml

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21433,6 +21433,17 @@ spec:
2143321433
description: Network address of the PostgreSQL server to follow
2143421434
via streaming replication.
2143521435
type: string
21436+
maxAcceptableLag:
21437+
anyOf:
21438+
- type: integer
21439+
- type: string
21440+
description: |-
21441+
MaxAcceptableLag is the maximum WAL lag allowed for the standby cluster, measured in bytes of WAL data.
21442+
This represents the maximum amount of WAL data that the standby can be behind the primary.
21443+
If the lag exceeds this value, the standby cluster is marked as unready.
21444+
If unset, lag is not checked.
21445+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
21446+
x-kubernetes-int-or-string: true
2143621447
port:
2143721448
description: Network port of the PostgreSQL server to follow via
2143821449
streaming replication.
@@ -21866,6 +21877,15 @@ spec:
2186621877
version:
2186721878
type: integer
2186821879
type: object
21880+
standby:
21881+
properties:
21882+
lagBytes:
21883+
format: int64
21884+
type: integer
21885+
lagLastComputedAt:
21886+
format: date-time
21887+
type: string
21888+
type: object
2186921889
state:
2187021890
type: string
2187121891
type: object

deploy/bundle.yaml

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21730,6 +21730,17 @@ spec:
2173021730
description: Network address of the PostgreSQL server to follow
2173121731
via streaming replication.
2173221732
type: string
21733+
maxAcceptableLag:
21734+
anyOf:
21735+
- type: integer
21736+
- type: string
21737+
description: |-
21738+
MaxAcceptableLag is the maximum WAL lag allowed for the standby cluster, measured in bytes of WAL data.
21739+
This represents the maximum amount of WAL data that the standby can be behind the primary.
21740+
If the lag exceeds this value, the standby cluster is marked as unready.
21741+
If unset, lag is not checked.
21742+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
21743+
x-kubernetes-int-or-string: true
2173321744
port:
2173421745
description: Network port of the PostgreSQL server to follow via
2173521746
streaming replication.
@@ -22163,6 +22174,15 @@ spec:
2216322174
version:
2216422175
type: integer
2216522176
type: object
22177+
standby:
22178+
properties:
22179+
lagBytes:
22180+
format: int64
22181+
type: integer
22182+
lagLastComputedAt:
22183+
format: date-time
22184+
type: string
22185+
type: object
2216622186
state:
2216722187
type: string
2216822188
type: object

deploy/cr.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,7 @@ spec:
6363
# host: "<primary-ip>"
6464
# port: "<primary-port>"
6565
# repoName: repo1
66+
# maxAcceptableLag: 1Gi
6667

6768
# openshift: true
6869

deploy/crd.yaml

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21730,6 +21730,17 @@ spec:
2173021730
description: Network address of the PostgreSQL server to follow
2173121731
via streaming replication.
2173221732
type: string
21733+
maxAcceptableLag:
21734+
anyOf:
21735+
- type: integer
21736+
- type: string
21737+
description: |-
21738+
MaxAcceptableLag is the maximum WAL lag allowed for the standby cluster, measured in bytes of WAL data.
21739+
This represents the maximum amount of WAL data that the standby can be behind the primary.
21740+
If the lag exceeds this value, the standby cluster is marked as unready.
21741+
If unset, lag is not checked.
21742+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
21743+
x-kubernetes-int-or-string: true
2173321744
port:
2173421745
description: Network port of the PostgreSQL server to follow via
2173521746
streaming replication.
@@ -22163,6 +22174,15 @@ spec:
2216322174
version:
2216422175
type: integer
2216522176
type: object
22177+
standby:
22178+
properties:
22179+
lagBytes:
22180+
format: int64
22181+
type: integer
22182+
lagLastComputedAt:
22183+
format: date-time
22184+
type: string
22185+
type: object
2216622186
state:
2216722187
type: string
2216822188
type: object

deploy/cw-bundle.yaml

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21730,6 +21730,17 @@ spec:
2173021730
description: Network address of the PostgreSQL server to follow
2173121731
via streaming replication.
2173221732
type: string
21733+
maxAcceptableLag:
21734+
anyOf:
21735+
- type: integer
21736+
- type: string
21737+
description: |-
21738+
MaxAcceptableLag is the maximum WAL lag allowed for the standby cluster, measured in bytes of WAL data.
21739+
This represents the maximum amount of WAL data that the standby can be behind the primary.
21740+
If the lag exceeds this value, the standby cluster is marked as unready.
21741+
If unset, lag is not checked.
21742+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
21743+
x-kubernetes-int-or-string: true
2173321744
port:
2173421745
description: Network port of the PostgreSQL server to follow via
2173521746
streaming replication.
@@ -22163,6 +22174,15 @@ spec:
2216322174
version:
2216422175
type: integer
2216522176
type: object
22177+
standby:
22178+
properties:
22179+
lagBytes:
22180+
format: int64
22181+
type: integer
22182+
lagLastComputedAt:
22183+
format: date-time
22184+
type: string
22185+
type: object
2216622186
state:
2216722187
type: string
2216822188
type: object

e2e-tests/functions

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -304,7 +304,7 @@ get_cr() {
304304
fi
305305

306306
case $test_name in
307-
"demand-backup" | "start-from-backup")
307+
"demand-backup" | "start-from-backup" | "standby-pgbackrest")
308308
yq eval -i '
309309
.spec.backups.pgbackrest.configuration = [{"secret":{"name":"'${test_name}'-pgbackrest-secrets"}}] |
310310
.spec.backups.pgbackrest.manual.repoName = "repo1" |

e2e-tests/run-pr.csv

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,8 @@ scaling
1616
scheduled-backup
1717
self-healing
1818
sidecars
19+
standby-pgbackrest
20+
standby-streaming
1921
start-from-backup
2022
tablespaces
2123
telemetry-transfer

0 commit comments

Comments (0)