Skip to content

Commit 5d7d789

Browse files
OCPBUGS-81476: Fix race condition in PinnedImages GC test
This commit fixes a race condition in the "All Nodes in a custom Pool should have the PinnedImages even after Garbage Collection" test that caused nodes to get stuck in a degraded state with a missing MachineConfig.

The Problem:

The test was using defers in the wrong order, causing cleanup to happen like this:

1. Delete KubeletConfig
2. Delete PinnedImageSet (triggers rendered-custom deletion)
3. Unlabel node (triggers transition to worker pool)
4. Wait for worker config

When step 3 triggered the transition, the node would reboot to apply the worker config. However, because the rendered-custom config was already deleted in step 2, the node would come back up with a reference to a non-existent config on disk and get stuck in a degraded state:

    currentConfig: rendered-custom-d356ed29481f2de2bb31c6443e1d29ca
    desiredConfig: rendered-worker-82faad7319f9e10715adbfd98a4b67ba
    state: Degraded
    reason: "machineconfig 'rendered-custom-d356ed29481f2de2bb31c6443e1d29ca' not found"

The Fix:

Changed the cleanup order to:

1. Unlabel node (triggers transition)
2. Wait for worker config transition to complete
3. Delete KubeletConfig
4. Delete PinnedImageSet

This ensures the node successfully transitions back to the worker pool BEFORE we delete any configs, eliminating the race condition.

Changes:

- Removed defers for unlabelNode, waitTillNodeReadyWithConfig, deletePIS, and deleteKC
- Added explicit cleanup after GCPISTest completes that performs operations in the correct order
- Added logging to track cleanup progress
- Removed defer deleteKC from GCPISTest function

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent c77ff4a commit 5d7d789

1 file changed

Lines changed: 17 additions & 4 deletions

File tree

test/extended/machine_config/pinnedimages.go

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,6 @@ var _ = g.Describe("[Suite:openshift/machine-config-operator/disruptive][sig-mco
9494
// Add node to custom MCP & wait for the node to be ready in the MCP
9595
optedNodes, err := addWorkerNodesToCustomPool(oc, kubeClient, 1, "custom")
9696
o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error adding node to `custom` MCP: %v", err))
97-
defer waitTillNodeReadyWithConfig(kubeClient, optedNodes[0], workerConfigPrefix)
98-
defer unlabelNode(oc, optedNodes[0])
9997
framework.Logf("Waiting for `%v` node to be ready in `custom` MCP.", optedNodes[0])
10098
waitTillNodeReadyWithConfig(kubeClient, optedNodes[0], customConfigPrefix)
10199

@@ -129,12 +127,28 @@ var _ = g.Describe("[Suite:openshift/machine-config-operator/disruptive][sig-mco
129127

130128
// Apply PIS
131129
defer deletePinnedImages(oc, kubeClient, clientSet, optedNodes, pinnedImages, isMetalDisconnected)
132-
defer deletePIS(oc, pis.Name)
133130
err = applyPIS(oc, pisFixture, pis, pisDiverged)
134131
o.Expect(err).NotTo(o.HaveOccurred(), "Applied PIS")
135132

136133
// Test the images applied in the PIS exist on the node after garbage collection.
137134
GCPISTest(oc, kubeClient, clientSet, true, optedNodes[0], kcFixture, gcImage, pis.Name, isMetalDisconnected)
135+
136+
// Cleanup: Transition node back to worker pool BEFORE deleting configs to avoid race condition.
137+
// If we delete the configs while the node is rebooting to transition back to worker pool,
138+
// the node may come up with a deleted rendered-custom config and get stuck in degraded state.
139+
framework.Logf("Cleaning up: Unlabeling node '%s' to move back to worker pool", optedNodes[0])
140+
err = unlabelNode(oc, optedNodes[0])
141+
o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error unlabeling node: %v", err))
142+
143+
framework.Logf("Waiting for node '%s' to transition to worker config", optedNodes[0])
144+
waitTillNodeReadyWithConfig(kubeClient, optedNodes[0], workerConfigPrefix)
145+
146+
// Now it's safe to delete the custom pool configs
147+
framework.Logf("Node transitioned successfully, deleting KubeletConfig and PinnedImageSet")
148+
err = deleteKC(oc, "custom-gc-config")
149+
o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error deleting KubeletConfig: %v", err))
150+
err = deletePIS(oc, pis.Name)
151+
o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error deleting PinnedImageSet: %v", err))
138152
})
139153

140154
g.It("All Nodes in a Custom Pool should have the PinnedImages in PIS [apigroup:machineconfiguration.openshift.io]", func() {
@@ -384,7 +398,6 @@ func addWorkerNodesToCustomPool(oc *exutil.CLI, kubeClient *kubernetes.Clientset
384398
// `GCPISTest` completes the body of a PIS test including the garbage collection step
385399
func GCPISTest(oc *exutil.CLI, kubeClient *kubernetes.Clientset, clientSet *mcClient.Clientset, success bool, nodeName, customGcKCFixture, gcImage, pisName string, isMetalDisconnected bool) {
386400
// Apply KC to Pool
387-
defer deleteKC(oc, "custom-gc-config")
388401
err := oc.Run("apply").Args("-f", customGcKCFixture).Execute()
389402
o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error applying garbage collection kubelet config: %s", err))
390403

0 commit comments

Comments
 (0)