From f26b07eb6d736eadab011baa210bda1ab4d013d9 Mon Sep 17 00:00:00 2001
From: James Nesbitt <jnesbitt@mirantis.com>
Date: Mon, 11 May 2026 17:00:25 +0300
Subject: [PATCH] phase/uninstall_mke: fall back to forced swarm dissolution on
 timeout

The uninstall-ucp bootstrapper deploys ucp-uninstall-agent as a global
Swarm service, then waits (~2 min hardcoded) for every node to report
back. On large clusters or hosts with cold image caches this deadline is
missed, causing Reset() to fail.

Observed in CI:
  smoke-modern (MKE 3.9.2, 7 nodes): all nodes missed the deadline
  smoke-windows (MKE 3.8.8, Win2025): Win2025 missed the deadline

MKE documents the recovery path: remove the stuck ucp-uninstall-agent
service, then force every node to leave the swarm.

pkg/product/mke/phase/uninstall_mke.go:
  - Capture Bootstrap output (not just error): the timeout message
    'Uninstalling UCP took too long' is logged at error level by MKE and
    appears only in the output stream, not in the Bootstrap error value
    (which only aggregates fatal-level log lines).
  - isUninstallTimeout(output string) detects the timeout from the output.
  - dissolveSwarm() removes ucp-uninstall-agent/ucp-uninstall-agent-win
    from the leader (best-effort), force-leaves all non-leader nodes in
    parallel (per-node failures are warnings), then force-leaves the
    leader last (hard failure if this fails).
  - Non-timeout uninstall-ucp errors still propagate as hard failures.

pkg/mcr/mcr.go (DrainNode):
  - Empty NodeID guard: after forced swarm dissolution every node returns
    an empty NodeID from 'docker info'; previously this caused DrainNode
    to run 'docker node update --availability drain <empty>' which fails.
    Now treated as a no-op (node is already out of the swarm).
  - Also removed a pre-existing duplicate drainCmd execution (the command
    was being run twice on the happy path).

pkg/product/mke/phase/uninstall_mke_test.go:
  - Added unit tests for isUninstallTimeout(string) (new file).

Signed-off-by: James Nesbitt <jnesbitt@mirantis.com>
---
 pkg/mcr/mcr.go                              | 12 ++--
 pkg/product/mke/phase/uninstall_mke.go      | 78 +++++++++++++++++++--
 pkg/product/mke/phase/uninstall_mke_test.go | 30 ++++++++
 3 files changed, 111 insertions(+), 9 deletions(-)
 create mode 100644 pkg/product/mke/phase/uninstall_mke_test.go
diff --git a/pkg/mcr/mcr.go b/pkg/mcr/mcr.go
index 8a51c644..4a45f2e7 100644
--- a/pkg/mcr/mcr.go
+++ b/pkg/mcr/mcr.go
@@ -16,18 +16,22 @@ var (
 )
 
 // DrainNode drains a node from the workload via docker drain command.
+// If the node is not part of a swarm (empty NodeID) the call is a no-op;
+// this is the expected state after a forced swarm dissolution.
 func DrainNode(lead *mkeconfig.Host, h *mkeconfig.Host) error {
 	nodeID, err := swarm.NodeID(h)
 	if err != nil {
 		return fmt.Errorf("failed to get node ID for %s: %w", h, err)
 	}
 
-	drainCmd := lead.Configurer.DockerCommandf("node update --availability drain %s", nodeID)
-	if err := lead.Exec(drainCmd); err != nil {
-		return fmt.Errorf("%s: failed to run MKE uninstaller: %w", lead, err)
+	if nodeID == "" {
+		log.Debugf("%s: not part of a swarm, skipping drain", h)
+		return nil
 	}
+
+	drainCmd := lead.Configurer.DockerCommandf("node update --availability drain %s", nodeID)
 	if err := lead.Exec(drainCmd); err != nil {
-		return fmt.Errorf("failed to drain node %s: %w", nodeID, err)
+		return fmt.Errorf("%s: failed to drain node %s: %w", lead, nodeID, err)
 	}
 
 	log.Infof("%s: node %s drained", lead, nodeID)
diff --git a/pkg/product/mke/phase/uninstall_mke.go b/pkg/product/mke/phase/uninstall_mke.go
index 314139b4..3ffba81c 100644
--- a/pkg/product/mke/phase/uninstall_mke.go
+++ b/pkg/product/mke/phase/uninstall_mke.go
@@ -2,6 +2,7 @@ package phase
 
 import (
 	"fmt"
+	"strings"
 
 	"github.com/Mirantis/launchpad/pkg/mke"
 	"github.com/Mirantis/launchpad/pkg/phase"
@@ -33,16 +34,36 @@ func (p *UninstallMKE) Run() error {
 
 	uninstallFlags := commonconfig.Flags{"--id", swarm.ClusterID(leader), "--purge-config"}
 
-	if _, err := mke.Bootstrap("uninstall-ucp", *p.Config, mke.BootstrapOptions{OperationFlags: uninstallFlags, ExecOptions: []exec.Option{exec.StreamOutput()}}); err != nil {
-		return fmt.Errorf("%s: failed to run MKE uninstaller: %w", leader, err)
+	// Capture both output and error: the timeout message ("Uninstalling UCP
+	// took too long") is emitted at error level by MKE and appears only in
+	// the streamed output, not in the returned error (which only aggregates
+	// fatal-level log lines from the bootstrapper).
+	output, err := mke.Bootstrap("uninstall-ucp", *p.Config, mke.BootstrapOptions{OperationFlags: uninstallFlags, ExecOptions: []exec.Option{exec.StreamOutput()}})
+	if err != nil {
+		// The uninstall-ucp bootstrapper deploys ucp-uninstall-agent as a global
+		// Swarm service and waits (hardcoded ~2 minutes) for every node to report
+		// back. On large clusters or hosts with cold image caches this deadline is
+		// missed. When that happens, MKE itself recommends:
+		//   1. Remove the stuck ucp-uninstall-agent service.
+		//   2. Force every node to leave the swarm.
+		// We implement that as an automatic fallback so that reset can continue
+		// to MCR uninstall without leaving a broken cluster behind.
+		if isUninstallTimeout(output) {
+			log.Warnf("%s: uninstall-ucp timed out waiting for nodes; falling back to forced swarm dissolution", leader)
+			if dissolveErr := dissolveSwarm(leader, p.Config.Spec.Hosts); dissolveErr != nil {
+				return fmt.Errorf("%s: uninstall-ucp timed out and forced swarm dissolution failed: %w (original: %w)", leader, dissolveErr, err)
+			}
+			log.Infof("%s: swarm dissolved; continuing with MCR uninstall", leader)
+		} else {
+			return fmt.Errorf("%s: failed to run MKE uninstaller: %w", leader, err)
+		}
 	}
 
 	managers := p.Config.Spec.Managers()
 	_ = managers.ParallelEach(func(h *mkeconfig.Host) error {
 		log.Infof("%s: removing ucp-controller-server-certs volume", h)
-		err := h.Exec(h.Configurer.DockerCommandf("volume rm --force ucp-controller-server-certs"))
-		if err != nil {
-			log.Errorf("%s: failed to remove the volume", h)
+		if err := h.Exec(h.Configurer.DockerCommandf("volume rm --force ucp-controller-server-certs")); err != nil {
+			log.Errorf("%s: failed to remove the volume: %v", h, err)
 		}
 
 		if err := h.Reboot(); err != nil {
@@ -61,3 +82,50 @@ func (p *UninstallMKE) Run() error {
 
 	return nil
 }
+
+// isUninstallTimeout returns true when the streamed output from the
+// uninstall-ucp bootstrapper contains the well-known node-acknowledgement
+// timeout message. MKE emits this at error level (not fatal), so it appears
+// only in Bootstrap's output string, not in the returned error value.
+func isUninstallTimeout(output string) bool {
+	return strings.Contains(output, "Uninstalling UCP took too long")
+}
+
+// dissolveSwarm forcibly tears down the Swarm cluster when uninstall-ucp
+// cannot do so cleanly. It follows the recovery steps documented by MKE:
+//
+//  1. Remove the stuck ucp-uninstall-agent / ucp-uninstall-agent-win services
+//     from the swarm leader (best-effort; they may already be gone).
+//  2. Force all non-leader nodes to leave the swarm in parallel.
+//  3. Force the leader to leave last.
+//
+// Errors from individual nodes are logged as warnings so that a single
+// unresponsive host does not prevent the rest of the cluster from being torn
+// down. Only the leader's final leave is treated as a hard failure.
+func dissolveSwarm(leader *mkeconfig.Host, hosts mkeconfig.Hosts) error {
+	// Step 1: remove the stuck uninstall-agent services (best-effort).
+	for _, svc := range []string{"ucp-uninstall-agent", "ucp-uninstall-agent-win"} {
+		log.Infof("%s: removing stuck service %s", leader, svc)
+		if err := leader.Exec(leader.Configurer.DockerCommandf("service rm %s", svc)); err != nil {
+			log.Debugf("%s: service rm %s: %v (may already be removed)", leader, svc, err)
+		}
+	}
+
+	// Step 2: force all non-leader nodes to leave the swarm.
+	nonLeaders := hosts.Filter(func(h *mkeconfig.Host) bool { return h != leader })
+	_ = nonLeaders.ParallelEach(func(h *mkeconfig.Host) error {
+		log.Infof("%s: force-leaving swarm", h)
+		if err := h.Exec(h.Configurer.DockerCommandf("swarm leave --force")); err != nil {
+			log.Warnf("%s: swarm leave --force failed: %v", h, err)
+		}
+		return nil // continue regardless; errors are warnings only
+	})
+
+	// Step 3: leader leaves last so it can still reach the other nodes above.
+	log.Infof("%s: force-leaving swarm (leader)", leader)
+	if err := leader.Exec(leader.Configurer.DockerCommandf("swarm leave --force")); err != nil {
+		return fmt.Errorf("swarm leader failed to leave: %w", err)
+	}
+
+	return nil
+}
diff --git a/pkg/product/mke/phase/uninstall_mke_test.go b/pkg/product/mke/phase/uninstall_mke_test.go
new file mode 100644
index 00000000..f2e69723
--- /dev/null
+++ b/pkg/product/mke/phase/uninstall_mke_test.go
@@ -0,0 +1,30 @@
+package phase
+
+import (
+	"testing"
+)
+
+func TestIsUninstallTimeout(t *testing.T) {
+	t.Run("matches MKE timeout output", func(t *testing.T) {
+		// MKE emits this at error level; it appears in Bootstrap's output string.
+		output := "Uninstalling UCP took too long!\nThe following nodes are unable to uninstall within the timeout: abc123\n"
+		if !isUninstallTimeout(output) {
+			t.Errorf("expected isUninstallTimeout=true for MKE timeout output, got false")
+		}
+	})
+
+	t.Run("does not match generic uninstall failure output", func(t *testing.T) {
+		// "unable to cleanly uninstall UCP" is the fatal line — it should NOT
+		// trigger dissolution on its own; it can appear for non-timeout reasons.
+		output := "unable to cleanly uninstall UCP\n"
+		if isUninstallTimeout(output) {
+			t.Errorf("expected isUninstallTimeout=false for generic failure output, got true")
+		}
+	})
+
+	t.Run("does not match empty output", func(t *testing.T) {
+		if isUninstallTimeout("") {
+			t.Errorf("expected isUninstallTimeout=false for empty output, got true")
+		}
+	})
+}