diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader.go b/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader.go
index 944a98117e0..ed5acab0af5 100644
--- a/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader.go
+++ b/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader.go
@@ -147,7 +147,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
 		return ctrl.Result{}, fmt.Errorf("token secret %s/%s is missing %q key", tokenSecret.Namespace, tokenSecret.Name, TokenSecretReleaseVersionKey)
 	}
 
-	return ctrl.Result{}, r.reconcileInPlaceUpgrade(ctx, nodePoolUpgradeAPI, tokenSecret, mcoImage, releaseVersion)
+	if err := r.reconcileInPlaceUpgrade(ctx, nodePoolUpgradeAPI, tokenSecret, mcoImage, releaseVersion); err != nil {
+		return ctrl.Result{}, err
+	}
+	// Requeue periodically while an upgrade is in progress. The controller only
+	// watches Nodes and MachineSets, so if an upgrade pod is force-deleted the
+	// deletion event is missed and the replacement pod is never created. A
+	// periodic recheck closes that gap.
+	return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
 }
 
 type nodePoolUpgradeAPI struct {
@@ -277,7 +284,7 @@ func (r *Reconciler) reconcileInPlaceUpgrade(ctx context.Context, nodePoolUpgrad
 
 	err = r.reconcileUpgradePods(ctx, r.guestClusterClient, nodes, nodePoolUpgradeAPI.spec.poolRef.GetName(), mcoImage, nodePoolUpgradeAPI.proxy)
 	if err != nil {
-		return fmt.Errorf("failed to delete idle upgrade pods: %w", err)
+		return fmt.Errorf("failed to reconcile upgrade pods: %w", err)
 	}
 	return nil
 }
@@ -312,7 +319,8 @@ func (r *Reconciler) reconcileUpgradePods(ctx context.Context, hostedClusterClie
 		pod := inPlaceUpgradePod(namespace.Name, node.Name)
 
 		if node.Annotations[CurrentMachineConfigAnnotationKey] == node.Annotations[DesiredMachineConfigAnnotationKey] &&
-			node.Annotations[DesiredDrainerAnnotationKey] == node.Annotations[LastAppliedDrainerAnnotationKey] {
+			node.Annotations[DesiredDrainerAnnotationKey] == node.Annotations[LastAppliedDrainerAnnotationKey] &&
+			node.Annotations[MachineConfigDaemonStateAnnotationKey] == MachineConfigDaemonStateDone {
 			// the node is updated and does not require a MCD running
 			if err := hostedClusterClient.Get(ctx, client.ObjectKeyFromObject(pod), pod); err != nil {
 				if apierrors.IsNotFound(err) {
@@ -349,6 +357,18 @@ func (r *Reconciler) reconcileUpgradePods(ctx context.Context, hostedClusterClie
 				} else {
 					log.Info("create upgrade pod", "result", result)
 				}
+			} else if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
+				if pod.DeletionTimestamp != nil {
+					continue
+				}
+				log.Info("Detected terminated upgrade pod on node that still needs upgrade, deleting for retry",
+					"node", node.Name, "podPhase", pod.Status.Phase)
+				if err := hostedClusterClient.Delete(ctx, pod); err != nil {
+					if apierrors.IsNotFound(err) {
+						continue
+					}
+					return fmt.Errorf("error deleting terminated upgrade MCD pod for node %s: %w", node.Name, err)
+				}
 			}
 		}
 	}
diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader_test.go b/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader_test.go
index a2f28becfa4..d4f0a9aa4de 100644
--- a/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader_test.go
+++ b/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader_test.go
@@ -2,6 +2,7 @@ package inplaceupgrader
 
 import (
 	"testing"
+	"time"
 
 	. "github.com/onsi/gomega"
 
@@ -11,6 +12,7 @@ import (
 
 	configv1 "github.com/openshift/api/config/v1"
 	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes/scheme"
 	"k8s.io/utils/ptr"
@@ -590,6 +592,220 @@ func TestCreateUpgradePod(t *testing.T) {
 	}
 }
 
+func TestReconcileUpgradePods(t *testing.T) {
+	_ = capiv1.AddToScheme(scheme.Scheme)
+
+	poolName := "test-pool"
+	mcoImage := "mco-image:latest"
+	namespace := inPlaceUpgradeNamespace(poolName)
+	targetConfig := "target-hash"
+
+	testCases := []struct {
+		name              string
+		node              *corev1.Node
+		existingPod       *corev1.Pod
+		expectPodDeleted  bool
+		expectPodCreated  bool
+		expectPodRetained bool
+		expectPodSkipped  bool
+	}{
+		{
+			name: "When MCD pod is in Succeeded phase on a node needing upgrade it should delete the terminated pod",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node1",
+					Annotations: map[string]string{
+						CurrentMachineConfigAnnotationKey: "old-hash",
+						DesiredMachineConfigAnnotationKey: targetConfig,
+					},
+				},
+			},
+			existingPod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: namespace.Name,
+					Name:      "machine-config-daemon-node1",
+				},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodSucceeded,
+				},
+			},
+			expectPodDeleted: true,
+		},
+		{
+			name: "When MCD pod is in Failed phase on a node needing upgrade it should delete the terminated pod",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node1",
+					Annotations: map[string]string{
+						CurrentMachineConfigAnnotationKey: "old-hash",
+						DesiredMachineConfigAnnotationKey: targetConfig,
+					},
+				},
+			},
+			existingPod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: namespace.Name,
+					Name:      "machine-config-daemon-node1",
+				},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodFailed,
+				},
+			},
+			expectPodDeleted: true,
+		},
+		{
+			name: "When MCD pod is in Running phase on a node needing upgrade it should leave the pod alone",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node1",
+					Annotations: map[string]string{
+						CurrentMachineConfigAnnotationKey: "old-hash",
+						DesiredMachineConfigAnnotationKey: targetConfig,
+					},
+				},
+			},
+			existingPod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: namespace.Name,
+					Name:      "machine-config-daemon-node1",
+				},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+				},
+			},
+			expectPodRetained: true,
+		},
+		{
+			name: "When no MCD pod exists on a node needing upgrade it should create a new pod",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node1",
+					Annotations: map[string]string{
+						CurrentMachineConfigAnnotationKey: "old-hash",
+						DesiredMachineConfigAnnotationKey: targetConfig,
+					},
+				},
+			},
+			existingPod:      nil,
+			expectPodCreated: true,
+		},
+		{
+			name: "When terminated MCD pod is already being deleted it should skip without error",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node1",
+					Annotations: map[string]string{
+						CurrentMachineConfigAnnotationKey: "old-hash",
+						DesiredMachineConfigAnnotationKey: targetConfig,
+					},
+				},
+			},
+			existingPod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace:         namespace.Name,
+					Name:              "machine-config-daemon-node1",
+					DeletionTimestamp: &metav1.Time{Time: time.Now()},
+					Finalizers:        []string{"test-finalizer"},
+				},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodFailed,
+				},
+			},
+			expectPodSkipped: true,
+		},
+		{
+			name: "When MCD pod is in Succeeded phase on a fully updated node it should delete the idle pod",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node1",
+					Annotations: map[string]string{
+						CurrentMachineConfigAnnotationKey:     targetConfig,
+						DesiredMachineConfigAnnotationKey:     targetConfig,
+						DesiredDrainerAnnotationKey:           "uncordon-xxx",
+						LastAppliedDrainerAnnotationKey:       "uncordon-xxx",
+						MachineConfigDaemonStateAnnotationKey: MachineConfigDaemonStateDone,
+					},
+				},
+			},
+			existingPod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: namespace.Name,
+					Name:      "machine-config-daemon-node1",
+				},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodSucceeded,
+				},
+			},
+			expectPodDeleted: true,
+		},
+		{
+			name: "When node config matches but MCD state is not Done it should delete the terminated pod for retry",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node1",
+					Annotations: map[string]string{
+						CurrentMachineConfigAnnotationKey:     targetConfig,
+						DesiredMachineConfigAnnotationKey:     targetConfig,
+						DesiredDrainerAnnotationKey:           "uncordon-xxx",
+						LastAppliedDrainerAnnotationKey:       "uncordon-xxx",
+						MachineConfigDaemonStateAnnotationKey: "Working",
+					},
+				},
+			},
+			existingPod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: namespace.Name,
+					Name:      "machine-config-daemon-node1",
+				},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodFailed,
+				},
+			},
+			expectPodDeleted: true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			g := NewWithT(t)
+
+			var guestObjects []client.Object
+			if tc.existingPod != nil {
+				guestObjects = append(guestObjects, tc.existingPod)
+			}
+			guestClient := fake.NewClientBuilder().WithScheme(scheme.Scheme).
+				WithObjects(guestObjects...).Build()
+
+			r := &Reconciler{
+				CreateOrUpdateProvider: upsert.New(false),
+			}
+
+			err := r.reconcileUpgradePods(t.Context(), guestClient, []*corev1.Node{tc.node}, poolName, mcoImage, nil)
+			g.Expect(err).ToNot(HaveOccurred())
+
+			pod := inPlaceUpgradePod(namespace.Name, tc.node.Name)
+			getErr := guestClient.Get(t.Context(), client.ObjectKeyFromObject(pod), pod)
+
+			if tc.expectPodDeleted {
+				g.Expect(apierrors.IsNotFound(getErr)).To(BeTrue(), "expected pod to be deleted, got: %v", getErr)
+			}
+			if tc.expectPodCreated {
+				g.Expect(getErr).ToNot(HaveOccurred(), "expected pod to be created")
+				g.Expect(pod.Spec.Containers).To(HaveLen(1))
+				g.Expect(pod.Spec.Containers[0].Image).To(Equal(mcoImage))
+			}
+			if tc.expectPodRetained {
+				g.Expect(getErr).ToNot(HaveOccurred(), "expected pod to be retained")
+				g.Expect(pod.Status.Phase).To(Equal(corev1.PodRunning))
+			}
+			if tc.expectPodSkipped {
+				g.Expect(getErr).ToNot(HaveOccurred(), "expected pod to still exist (skip due to DeletionTimestamp)")
+				g.Expect(pod.DeletionTimestamp).ToNot(BeNil(), "expected DeletionTimestamp to remain from original object")
+			}
+		})
+	}
+}
+
 func TestReconcileInPlaceUpgradeAnnotatesMachineWithNodePoolVersion(t *testing.T) {
 	g := NewWithT(t)
 	_ = capiv1.AddToScheme(scheme.Scheme)