Skip to content

Commit 567899b

Browse files
committed
phase/uninstall_mke: fall back to forced swarm dissolution on timeout
The uninstall-ucp bootstrapper deploys ucp-uninstall-agent as a global Swarm service, then waits (~2 minutes, hardcoded) for every node to report back. On large or mixed-OS clusters with cold image caches this deadline is missed, causing Reset() to fail even though the infrastructure will be torn down by terraform destroy anyway.

Observed in CI:
- smoke-modern (MKE 3.9.2, 7 nodes): all nodes missed the deadline.
- smoke-windows (MKE 3.8.8, Win2025): the Win2025 node missed the deadline.

MKE itself documents the recovery path when this happens:
1. Remove the stuck ucp-uninstall-agent service.
2. Force every node to leave the swarm.

Implement that as an automatic fallback inside UninstallMKE.Run():
- isUninstallTimeout() detects the specific 'Uninstalling UCP took too long' fatal line that Bootstrap surfaces from the MKE container.
- dissolveSwarm() removes the stuck service (best-effort), then forces all non-leader nodes to leave in parallel, then forces the leader to leave last. Per-node failures are logged as warnings so that a single unresponsive host does not block the rest.

Other uninstall-ucp errors (connection failures, image pull errors, etc.) are still returned as hard failures, unchanged.
1 parent 5224c5d commit 567899b

2 files changed

Lines changed: 96 additions & 4 deletions

File tree

pkg/product/mke/phase/uninstall_mke.go

Lines changed: 67 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package phase
22

33
import (
44
"fmt"
5+
"strings"
56

67
"github.com/Mirantis/launchpad/pkg/mke"
78
"github.com/Mirantis/launchpad/pkg/phase"
@@ -34,15 +35,30 @@ func (p *UninstallMKE) Run() error {
3435
uninstallFlags := commonconfig.Flags{"--id", swarm.ClusterID(leader), "--purge-config"}
3536

3637
if _, err := mke.Bootstrap("uninstall-ucp", *p.Config, mke.BootstrapOptions{OperationFlags: uninstallFlags, ExecOptions: []exec.Option{exec.StreamOutput()}}); err != nil {
37-
return fmt.Errorf("%s: failed to run MKE uninstaller: %w", leader, err)
38+
// The uninstall-ucp bootstrapper deploys ucp-uninstall-agent as a global
39+
// Swarm service and waits (hardcoded ~2 minutes) for every node to report
40+
// back. On large clusters or hosts with cold image caches this deadline is
41+
// missed. When that happens, MKE itself recommends:
42+
// 1. Remove the stuck ucp-uninstall-agent service.
43+
// 2. Force every node to leave the swarm.
44+
// We implement that as an automatic fallback so that reset can continue
45+
// to MCR uninstall without leaving a broken cluster behind.
46+
if isUninstallTimeout(err) {
47+
log.Warnf("%s: uninstall-ucp timed out waiting for nodes; falling back to forced swarm dissolution", leader)
48+
if dissolveErr := dissolveSwarm(leader, p.Config.Spec.Hosts); dissolveErr != nil {
49+
return fmt.Errorf("%s: uninstall-ucp timed out and forced swarm dissolution failed: %w (original: %w)", leader, dissolveErr, err)
50+
}
51+
log.Infof("%s: swarm dissolved; continuing with MCR uninstall", leader)
52+
} else {
53+
return fmt.Errorf("%s: failed to run MKE uninstaller: %w", leader, err)
54+
}
3855
}
3956

4057
managers := p.Config.Spec.Managers()
4158
_ = managers.ParallelEach(func(h *mkeconfig.Host) error {
4259
log.Infof("%s: removing ucp-controller-server-certs volume", h)
43-
err := h.Exec(h.Configurer.DockerCommandf("volume rm --force ucp-controller-server-certs"))
44-
if err != nil {
45-
log.Errorf("%s: failed to remove the volume", h)
60+
if err := h.Exec(h.Configurer.DockerCommandf("volume rm --force ucp-controller-server-certs")); err != nil {
61+
log.Errorf("%s: failed to remove the volume: %v", h, err)
4662
}
4763

4864
if err := h.Reboot(); err != nil {
@@ -61,3 +77,50 @@ func (p *UninstallMKE) Run() error {
6177

6278
return nil
6379
}
80+
81+
// isUninstallTimeout reports whether err is the well-known "took too long"
// timeout from the uninstall-ucp bootstrapper.
//
// MKE emits this as a fatal log line when its per-node acknowledgement
// deadline expires; Bootstrap surfaces it verbatim in the returned error, so
// detection is a substring match on the error text.
//
// A nil err is never a timeout and yields false.
func isUninstallTimeout(err error) bool {
	// Guard first: calling Error() on a nil error would panic.
	if err == nil {
		return false
	}
	return strings.Contains(err.Error(), "Uninstalling UCP took too long")
}
88+
89+
// dissolveSwarm forcibly tears down the Swarm cluster when uninstall-ucp
90+
// cannot do so cleanly. It follows the recovery steps documented by MKE:
91+
//
92+
// 1. Remove the stuck ucp-uninstall-agent / ucp-uninstall-agent-win services
93+
// from the swarm leader (best-effort; they may already be gone).
94+
// 2. Force all non-leader nodes to leave the swarm in parallel.
95+
// 3. Force the leader to leave last.
96+
//
97+
// Errors from individual nodes are logged as warnings so that a single
98+
// unresponsive host does not prevent the rest of the cluster from being torn
99+
// down. Only the leader's final leave is treated as a hard failure.
100+
func dissolveSwarm(leader *mkeconfig.Host, hosts mkeconfig.Hosts) error {
101+
// Step 1: remove the stuck uninstall-agent services (best-effort).
102+
for _, svc := range []string{"ucp-uninstall-agent", "ucp-uninstall-agent-win"} {
103+
log.Infof("%s: removing stuck service %s", leader, svc)
104+
if err := leader.Exec(leader.Configurer.DockerCommandf("service rm %s", svc)); err != nil {
105+
log.Debugf("%s: service rm %s: %v (may already be removed)", leader, svc, err)
106+
}
107+
}
108+
109+
// Step 2: force all non-leader nodes to leave the swarm.
110+
nonLeaders := hosts.Filter(func(h *mkeconfig.Host) bool { return h != leader })
111+
_ = nonLeaders.ParallelEach(func(h *mkeconfig.Host) error {
112+
log.Infof("%s: force-leaving swarm", h)
113+
if err := h.Exec(h.Configurer.DockerCommandf("swarm leave --force")); err != nil {
114+
log.Warnf("%s: swarm leave --force failed: %v", h, err)
115+
}
116+
return nil // continue regardless; errors are warnings only
117+
})
118+
119+
// Step 3: leader leaves last so it can still reach the other nodes above.
120+
log.Infof("%s: force-leaving swarm (leader)", leader)
121+
if err := leader.Exec(leader.Configurer.DockerCommandf("swarm leave --force")); err != nil {
122+
return fmt.Errorf("swarm leader failed to leave: %w", err)
123+
}
124+
125+
return nil
126+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package phase
2+
3+
import (
4+
"errors"
5+
"testing"
6+
)
7+
8+
func TestIsUninstallTimeout(t *testing.T) {
9+
t.Run("matches MKE timeout error", func(t *testing.T) {
10+
err := errors.New("mke bootstrap uninstall-ucp failure; MKE uninstall-ucp: Uninstalling UCP took too long!")
11+
if !isUninstallTimeout(err) {
12+
t.Errorf("expected isUninstallTimeout=true for MKE timeout error, got false")
13+
}
14+
})
15+
16+
t.Run("does not match unrelated error", func(t *testing.T) {
17+
err := errors.New("mke bootstrap uninstall-ucp failure; MKE uninstall-ucp: unable to cleanly uninstall UCP")
18+
if isUninstallTimeout(err) {
19+
t.Errorf("expected isUninstallTimeout=false for generic uninstall error, got true")
20+
}
21+
})
22+
23+
t.Run("does not match connection error", func(t *testing.T) {
24+
err := errors.New("[ssh] 1.2.3.4:22: connection refused")
25+
if isUninstallTimeout(err) {
26+
t.Errorf("expected isUninstallTimeout=false for connection error, got true")
27+
}
28+
})
29+
}

0 commit comments

Comments
 (0)