Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion build/pxc-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,7 @@ if [ "$1" = 'mysqld' ] && [ -z "$wantHelp" ]; then
| sed 's/^[ \t]*//'
)"
wsrep_start_position_opt="--wsrep_start_position=$start_pos"
uuid=$(echo "$start_pos" | awk -F':' '{print $1}' || :)
seqno=$(echo "$start_pos" | awk -F':' '{print $NF}' || :)
else
# The server prints "..skipping position recovery.." if started without wsrep.
Expand Down Expand Up @@ -755,6 +756,9 @@ if [ "$1" = 'mysqld' ] && [ -z "$wantHelp" ]; then
|| [[ -z $is_primary_exists && -f $grastate_loc && $safe_to_bootstrap == 1 && -n ${CLUSTER_JOIN} ]]; then
trap '{ node_recovery "$@" ; }' USR1
touch /tmp/recovery-case
if [[ -z ${uuid} ]]; then
uuid="00000000-0000-0000-0000-000000000000"
fi
if [[ -z ${seqno} ]]; then
seqno="-1"
fi
Expand All @@ -765,12 +769,13 @@ if [ "$1" = 'mysqld' ] && [ -z "$wantHelp" ]; then
echo "#####################################################FULL_PXC_CLUSTER_CRASH:$NODE_NAME#####################################################"
echo 'You have the situation of a full PXC cluster crash. In order to restore your PXC cluster, please check the log'
echo 'from all pods/nodes to find the node with the most recent data (the one with the highest sequence number (seqno).'
echo "Cluster UUID: $uuid"
echo "It is $NODE_NAME node with sequence number (seqno): $seqno"
echo 'Cluster will recover automatically from the crash now.'
echo 'If you have set spec.pxc.autoRecovery to false, run the following command to recover manually from this node:'
echo "kubectl -n $POD_NAMESPACE exec $(hostname) -c pxc -- sh -c 'kill -s USR1 1'"
#DO NOT CHANGE THE LINE BELOW. OUR AUTO-RECOVERY IS USING IT TO DETECT SEQNO OF CURRENT NODE. See K8SPXC-564
echo "#####################################################LAST_LINE:$NODE_NAME:$seqno:#####################################################"
echo "#####################################################LAST_LINE:$NODE_NAME:$uuid:$seqno:#####################################################"

for (( ; ; )); do
is_primary_exists=$(get_primary)
Expand Down
13 changes: 13 additions & 0 deletions config/crd/bases/pxc.percona.com_perconaxtradbclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11035,6 +11035,19 @@ spec:
ready:
format: int32
type: integer
recovery:
properties:
clusterUUID:
type: string
lastRecoveryPod:
type: string
lastRecoverySeqNo:
format: int64
type: integer
lastRecoveryTime:
format: date-time
type: string
type: object
size:
format: int32
type: integer
Expand Down
13 changes: 13 additions & 0 deletions deploy/bundle.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12365,6 +12365,19 @@ spec:
ready:
format: int32
type: integer
recovery:
properties:
clusterUUID:
type: string
lastRecoveryPod:
type: string
lastRecoverySeqNo:
format: int64
type: integer
lastRecoveryTime:
format: date-time
type: string
type: object
size:
format: int32
type: integer
Expand Down
13 changes: 13 additions & 0 deletions deploy/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12365,6 +12365,19 @@ spec:
ready:
format: int32
type: integer
recovery:
properties:
clusterUUID:
type: string
lastRecoveryPod:
type: string
lastRecoverySeqNo:
format: int64
type: integer
lastRecoveryTime:
format: date-time
type: string
type: object
size:
format: int32
type: integer
Expand Down
13 changes: 13 additions & 0 deletions deploy/cw-bundle.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12365,6 +12365,19 @@ spec:
ready:
format: int32
type: integer
recovery:
properties:
clusterUUID:
type: string
lastRecoveryPod:
type: string
lastRecoverySeqNo:
format: int64
type: integer
lastRecoveryTime:
format: date-time
type: string
type: object
size:
format: int32
type: integer
Expand Down
2 changes: 0 additions & 2 deletions e2e-tests/tls-issue-cert-manager/run
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,6 @@ main() {
kubectl_bin delete pods -l app.kubernetes.io/instance=$cluster,app.kubernetes.io/managed-by=percona-xtradb-cluster-operator --force --grace-period=0

desc 'wait for cluster to recover after full restart'
wait_for_running "$cluster-haproxy" 1
wait_for_running "$cluster-pxc" 3
Comment on lines -152 to -153
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this related to the changes in this PR?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really, but the tls-issue-cert-manager failure was blocking this PR.

The issue was introduced by 7f885a1; it probably slipped through on one of the retries and we merged it. To rotate the certificates, the test deletes all pods at once and then runs wait_for_running, which calls wait_pod, which checks for a full cluster crash and exits immediately if it detects one. But here we are causing the crash deliberately, so I removed these calls and opted to wait for cluster readiness only.

wait_cluster_consistency "$cluster" 3 2

desc 'check ssl-internal certificate using PXC after CA rotation'
Expand Down
24 changes: 24 additions & 0 deletions pkg/apis/pxc/v1/pxc_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,7 @@ type PerconaXtraDBClusterStatus struct {
Backup ComponentStatus `json:"backup,omitempty"`
PMM ComponentStatus `json:"pmm,omitempty"`
LogCollector ComponentStatus `json:"logcollector,omitempty"`
Recovery *RecoveryStatus `json:"recovery,omitempty"`
Host string `json:"host,omitempty"`
Messages []string `json:"message,omitempty"`
Status AppState `json:"state,omitempty"`
Expand Down Expand Up @@ -376,6 +377,29 @@ type AppStatus struct {
Ready int32 `json:"ready,omitempty"`
}

// RecoveryStatus records the outcome of the most recent full-cluster-crash
// recovery. It is consulted on subsequent crashes to decide whether automatic
// recovery is safe: a UUID change or seqno regression indicates the operator
// would be bootstrapping from a node with stale or unrelated data, so manual
// intervention is required.
//
// It is published as the optional "recovery" object of the cluster status.
// Every field carries omitempty, so a field holding its zero value may be
// absent from the serialized status (see the per-field notes for exceptions).
type RecoveryStatus struct {
// ClusterUUID is the Galera cluster UUID reported by the pod the operator
// recovered from. The all-zeros UUID means the pod's grastate.dat had no
// recoverable UUID (uninitialized or reset). An empty value means the log
// line did not include a UUID (PXC entrypoint <1.20.0).
//
// The entrypoint script itself substitutes
// "00000000-0000-0000-0000-000000000000" when it could not extract a UUID
// from the recovered start position, so the all-zeros value originates
// there rather than in the operator.
ClusterUUID string `json:"clusterUUID,omitempty"`
// LastRecoveryTime is when the operator triggered the most recent
// full-cluster-crash recovery.
//
// NOTE(review): encoding/json's omitempty never omits a struct-typed value,
// so a zero metav1.Time is presumably still emitted (as null) instead of
// being dropped — confirm whether *metav1.Time was intended.
LastRecoveryTime metav1.Time `json:"lastRecoveryTime,omitempty"`
// LastRecoveryPod is the pod the operator picked to bootstrap from
// (the one with the highest reported seqno).
LastRecoveryPod string `json:"lastRecoveryPod,omitempty"`
// LastRecoverySeqNo is the wsrep sequence number of the pod that was
// used to bootstrap. A subsequent recovery with a lower seqno is refused
// automatically, since proceeding would discard committed transactions.
//
// Because of omitempty, a seqno of exactly 0 is omitted from the JSON and
// is indistinguishable from "never set" on the wire. The entrypoint script
// reports "-1" when it has no seqno, so negative values can appear here.
// NOTE(review): confirm that consumers treat 0 and -1 as intended.
LastRecoverySeqNo int64 `json:"lastRecoverySeqNo,omitempty"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// PerconaXtraDBCluster is the Schema for the perconaxtradbclusters API
Expand Down
21 changes: 21 additions & 0 deletions pkg/apis/pxc/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading