Skip to content

Commit f26b07e

Browse files
committed
phase/uninstall_mke: fall back to forced swarm dissolution on timeout
The uninstall-ucp bootstrapper deploys ucp-uninstall-agent as a global Swarm service, then waits (~2 min, hardcoded) for every node to report back. On large clusters or hosts with cold image caches this deadline is missed, causing Reset() to fail.
Observed in CI:
- smoke-modern (MKE 3.9.2, 7 nodes): all nodes missed the deadline.
- smoke-windows (MKE 3.8.8, Win2025): Win2025 missed the deadline.
MKE documents the recovery path: remove the stuck ucp-uninstall-agent service, then force every node to leave the swarm.
pkg/product/mke/phase/uninstall_mke.go:
- Capture Bootstrap output (not just the error): the timeout message 'Uninstalling UCP took too long' is logged at error level by MKE and appears only in the output stream, not in the Bootstrap error value (which only aggregates fatal-level log lines).
- isUninstallTimeout(output string) detects the timeout from the output.
- dissolveSwarm() removes ucp-uninstall-agent/ucp-uninstall-agent-win from the leader (best-effort), force-leaves all non-leader nodes in parallel (per-node failures are warnings), then force-leaves the leader last (hard failure if this fails).
- Non-timeout uninstall-ucp errors still propagate as hard failures.
pkg/mcr/mcr.go (DrainNode):
- Empty NodeID guard: after forced swarm dissolution every node returns an empty NodeID from 'docker info'; previously this caused DrainNode to run 'docker node update --availability drain <empty>', which fails. This is now treated as a no-op (the node is already out of the swarm).
- Also removed a pre-existing duplicate drainCmd execution (the command was being run twice on the happy path).
pkg/product/mke/phase/uninstall_mke_test.go:
- Updated tests to match the new isUninstallTimeout(string) signature.
Signed-off-by: James Nesbitt <jnesbitt@mirantis.com>
1 parent c691642 commit f26b07e

3 files changed

Lines changed: 111 additions & 9 deletions

File tree

pkg/mcr/mcr.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,22 @@ var (
1616
)
1717

1818
// DrainNode drains a node from the workload via docker drain command.
19+
// If the node is not part of a swarm (empty NodeID) the call is a no-op;
20+
// this is the expected state after a forced swarm dissolution.
1921
func DrainNode(lead *mkeconfig.Host, h *mkeconfig.Host) error {
2022
nodeID, err := swarm.NodeID(h)
2123
if err != nil {
2224
return fmt.Errorf("failed to get node ID for %s: %w", h, err)
2325
}
2426

25-
drainCmd := lead.Configurer.DockerCommandf("node update --availability drain %s", nodeID)
26-
if err := lead.Exec(drainCmd); err != nil {
27-
return fmt.Errorf("%s: failed to run MKE uninstaller: %w", lead, err)
27+
if nodeID == "" {
28+
log.Debugf("%s: not part of a swarm, skipping drain", h)
29+
return nil
2830
}
31+
32+
drainCmd := lead.Configurer.DockerCommandf("node update --availability drain %s", nodeID)
2933
if err := lead.Exec(drainCmd); err != nil {
30-
return fmt.Errorf("failed to drain node %s: %w", nodeID, err)
34+
return fmt.Errorf("%s: failed to drain node %s: %w", lead, nodeID, err)
3135
}
3236

3337
log.Infof("%s: node %s drained", lead, nodeID)

pkg/product/mke/phase/uninstall_mke.go

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package phase
22

33
import (
44
"fmt"
5+
"strings"
56

67
"github.com/Mirantis/launchpad/pkg/mke"
78
"github.com/Mirantis/launchpad/pkg/phase"
@@ -33,16 +34,36 @@ func (p *UninstallMKE) Run() error {
3334

3435
uninstallFlags := commonconfig.Flags{"--id", swarm.ClusterID(leader), "--purge-config"}
3536

36-
if _, err := mke.Bootstrap("uninstall-ucp", *p.Config, mke.BootstrapOptions{OperationFlags: uninstallFlags, ExecOptions: []exec.Option{exec.StreamOutput()}}); err != nil {
37-
return fmt.Errorf("%s: failed to run MKE uninstaller: %w", leader, err)
37+
// Capture both output and error: the timeout message ("Uninstalling UCP
38+
// took too long") is emitted at error level by MKE and appears only in
39+
// the streamed output, not in the returned error (which only aggregates
40+
// fatal-level log lines from the bootstrapper).
41+
output, err := mke.Bootstrap("uninstall-ucp", *p.Config, mke.BootstrapOptions{OperationFlags: uninstallFlags, ExecOptions: []exec.Option{exec.StreamOutput()}})
42+
if err != nil {
43+
// The uninstall-ucp bootstrapper deploys ucp-uninstall-agent as a global
44+
// Swarm service and waits (hardcoded ~2 minutes) for every node to report
45+
// back. On large clusters or hosts with cold image caches this deadline is
46+
// missed. When that happens, MKE itself recommends:
47+
// 1. Remove the stuck ucp-uninstall-agent service.
48+
// 2. Force every node to leave the swarm.
49+
// We implement that as an automatic fallback so that reset can continue
50+
// to MCR uninstall without leaving a broken cluster behind.
51+
if isUninstallTimeout(output) {
52+
log.Warnf("%s: uninstall-ucp timed out waiting for nodes; falling back to forced swarm dissolution", leader)
53+
if dissolveErr := dissolveSwarm(leader, p.Config.Spec.Hosts); dissolveErr != nil {
54+
return fmt.Errorf("%s: uninstall-ucp timed out and forced swarm dissolution failed: %w (original: %w)", leader, dissolveErr, err)
55+
}
56+
log.Infof("%s: swarm dissolved; continuing with MCR uninstall", leader)
57+
} else {
58+
return fmt.Errorf("%s: failed to run MKE uninstaller: %w", leader, err)
59+
}
3860
}
3961

4062
managers := p.Config.Spec.Managers()
4163
_ = managers.ParallelEach(func(h *mkeconfig.Host) error {
4264
log.Infof("%s: removing ucp-controller-server-certs volume", h)
43-
err := h.Exec(h.Configurer.DockerCommandf("volume rm --force ucp-controller-server-certs"))
44-
if err != nil {
45-
log.Errorf("%s: failed to remove the volume", h)
65+
if err := h.Exec(h.Configurer.DockerCommandf("volume rm --force ucp-controller-server-certs")); err != nil {
66+
log.Errorf("%s: failed to remove the volume: %v", h, err)
4667
}
4768

4869
if err := h.Reboot(); err != nil {
@@ -61,3 +82,50 @@ func (p *UninstallMKE) Run() error {
6182

6283
return nil
6384
}
85+
86+
// isUninstallTimeout reports whether output, the text streamed by the
// uninstall-ucp bootstrapper, contains the node-acknowledgement timeout
// message. The check is a plain, case-sensitive substring match. MKE logs
// this message at error level rather than fatal, so it shows up in the
// output string returned by Bootstrap and not in the returned error value.
func isUninstallTimeout(output string) bool {
	// Marker text emitted by MKE when nodes fail to report back in time.
	const timeoutMarker = "Uninstalling UCP took too long"

	return strings.Contains(output, timeoutMarker)
}
93+
94+
// dissolveSwarm forcibly tears down the Swarm cluster when uninstall-ucp
95+
// cannot do so cleanly. It follows the recovery steps documented by MKE:
96+
//
97+
// 1. Remove the stuck ucp-uninstall-agent / ucp-uninstall-agent-win services
98+
// from the swarm leader (best-effort; they may already be gone).
99+
// 2. Force all non-leader nodes to leave the swarm in parallel.
100+
// 3. Force the leader to leave last.
101+
//
102+
// Errors from individual nodes are logged as warnings so that a single
103+
// unresponsive host does not prevent the rest of the cluster from being torn
104+
// down. Only the leader's final leave is treated as a hard failure.
105+
func dissolveSwarm(leader *mkeconfig.Host, hosts mkeconfig.Hosts) error {
106+
// Step 1: remove the stuck uninstall-agent services (best-effort).
107+
for _, svc := range []string{"ucp-uninstall-agent", "ucp-uninstall-agent-win"} {
108+
log.Infof("%s: removing stuck service %s", leader, svc)
109+
if err := leader.Exec(leader.Configurer.DockerCommandf("service rm %s", svc)); err != nil {
110+
log.Debugf("%s: service rm %s: %v (may already be removed)", leader, svc, err)
111+
}
112+
}
113+
114+
// Step 2: force all non-leader nodes to leave the swarm.
115+
nonLeaders := hosts.Filter(func(h *mkeconfig.Host) bool { return h != leader })
116+
_ = nonLeaders.ParallelEach(func(h *mkeconfig.Host) error {
117+
log.Infof("%s: force-leaving swarm", h)
118+
if err := h.Exec(h.Configurer.DockerCommandf("swarm leave --force")); err != nil {
119+
log.Warnf("%s: swarm leave --force failed: %v", h, err)
120+
}
121+
return nil // continue regardless; errors are warnings only
122+
})
123+
124+
// Step 3: leader leaves last so it can still reach the other nodes above.
125+
log.Infof("%s: force-leaving swarm (leader)", leader)
126+
if err := leader.Exec(leader.Configurer.DockerCommandf("swarm leave --force")); err != nil {
127+
return fmt.Errorf("swarm leader failed to leave: %w", err)
128+
}
129+
130+
return nil
131+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
package phase
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestIsUninstallTimeout(t *testing.T) {
8+
t.Run("matches MKE timeout output", func(t *testing.T) {
9+
// MKE emits this at error level; it appears in Bootstrap's output string.
10+
output := "Uninstalling UCP took too long!\nThe following nodes are unable to uninstall within the timeout: abc123\n"
11+
if !isUninstallTimeout(output) {
12+
t.Errorf("expected isUninstallTimeout=true for MKE timeout output, got false")
13+
}
14+
})
15+
16+
t.Run("does not match generic uninstall failure output", func(t *testing.T) {
17+
// "unable to cleanly uninstall UCP" is the fatal line — it should NOT
18+
// trigger dissolution on its own; it can appear for non-timeout reasons.
19+
output := "unable to cleanly uninstall UCP\n"
20+
if isUninstallTimeout(output) {
21+
t.Errorf("expected isUninstallTimeout=false for generic failure output, got true")
22+
}
23+
})
24+
25+
t.Run("does not match empty output", func(t *testing.T) {
26+
if isUninstallTimeout("") {
27+
t.Errorf("expected isUninstallTimeout=false for empty output, got true")
28+
}
29+
})
30+
}

0 commit comments

Comments
 (0)