[multiple] Fix MCO stuck-uncordon deadlock

vakwetu · claude · openshift-merge-bot[bot] · commit 79aaded2d444 · 2026-03-27T19:41:09.000Z
MachineConfigs applied during devscripts install trigger an MCO update
cycle that runs asynchronously after the cluster becomes reachable.  On
compact 3-master clusters the MCO controller can enter a permanent
deadlock: all nodes reboot, apply the new config, and report
state=Done with desiredDrain=lastAppliedDrain=uncordon-*, but the
controller never issues the final kubectl uncordon.  This leaves all
nodes SchedulingDisabled indefinitely, causing every subsequent cluster
operator to degrade and the deployment to time out.

Add a retry loop in wait_for_cluster.yml (run as part of the
openshift_adm 'stable' operation after devscripts post-install) that:

- Polls MachineConfigPool status every 30 s for up to 30 minutes.
- If a pool is updating normally (nodes being drained/rebooted in
  sequence) it waits without interrupting the MCO mid-cycle.
- If it detects the stuck state (updatedMachineCount == machineCount
  but readyMachineCount == 0) it runs 'oc adm uncordon' on all nodes
  to break the deadlock, then continues polling.
- Only proceeds to 'oc adm wait-for-stable-cluster' once all pools
  report Updated=True.

Signed-off-by: Ade Lee <alee@redhat.com>
Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/roles/openshift_adm/tasks/wait_for_cluster.yml b/roles/openshift_adm/tasks/wait_for_cluster.yml
@@ -50,6 +50,73 @@
   retries: "{{ cifmw_openshift_adm_retry_count }}"
   delay: 30
 
+# MachineConfigs applied during devscripts install (e.g. iSCSI, Cinder LVM)
+# trigger an MCO update cycle that continues asynchronously after the cluster
+# is first reachable.  On compact (3-master) clusters the MCO controller can
+# get stuck: all nodes reboot and report state=Done / desiredDrain=uncordon-*,
+# but the controller never issues the final kubectl-uncordon, leaving every
+# node SchedulingDisabled indefinitely.  We handle this with a loop that:
+#   1. Waits until no MCP is mid-update (no pool reports Updating=True)
+#      OR detects the stuck state (all updated, none ready).
+#   2. If stuck, uncordons all nodes to break the deadlock.
+#   3. Repeats until all MCPs report Updated=True.
+- name: Wait for MachineConfigPools to complete, fixing stuck cordons if needed.
+  when:
+    - not cifmw_openshift_adm_dry_run
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path }}"
+  # Exits 0 only when no pool is Updating; any other exit makes `until` retry.
+  ansible.builtin.shell: |
+    set -eo pipefail
+    MCP_JSON=$(oc get mcp -o json)
+
+    UPDATING=$(echo "$MCP_JSON" | python3 -c "
+    import json, sys
+    data = json.load(sys.stdin)
+    updating = [
+        i['metadata']['name'] for i in data['items']
+        if next((c['status'] for c in i['status'].get('conditions', [])
+                  if c['type'] == 'Updating'), 'False') == 'True'
+    ]
+    print('\n'.join(updating))
+    ")
+    if [ -z "$UPDATING" ]; then
+      echo "All MCPs are up to date."
+      exit 0
+    fi
+
+    # At least one MCP is still Updating.  Check for the stuck-uncordon case:
+    # updatedMachineCount == machineCount but readyMachineCount == 0.
+    STUCK=$(echo "$MCP_JSON" | python3 -c "
+    import json, sys
+    data = json.load(sys.stdin)
+    stuck = [
+        i['metadata']['name'] for i in data['items']
+        if (i['status'].get('updatedMachineCount', 0) ==
+            i['status'].get('machineCount', 0) and
+            i['status'].get('readyMachineCount', 0) == 0 and
+            i['status'].get('machineCount', 0) > 0)
+    ]
+    print('\n'.join(stuck))
+    ")
+    if [ -n "$STUCK" ]; then
+      echo "Stuck MCPs detected: $STUCK -- uncordoning all nodes to break deadlock."
+      oc adm uncordon $(oc get nodes -o jsonpath='{.items[*].metadata.name}')
+    else
+      echo "MCPs still updating (normal progress): $UPDATING"
+    fi
+    exit 1
+  # pipefail is a bash option; without this the shell module runs /bin/sh.
+  args:
+    executable: /bin/bash
+  register: _mcp_wait
+  until: _mcp_wait.rc == 0
+  retries: 60
+  delay: 30
+  changed_when: "'uncordoning' in _mcp_wait.stdout"
+  failed_when: false
+
 - name: Check for pending certificate approval.
   when:
     - _openshift_adm_check_cert_approve | default(false) | bool