diff --git a/pkg/mcr/mcr.go b/pkg/mcr/mcr.go index 8a51c644..4a45f2e7 100644 --- a/pkg/mcr/mcr.go +++ b/pkg/mcr/mcr.go @@ -16,18 +16,22 @@ var ( ) // DrainNode drains a node from the workload via docker drain command. +// If the node is not part of a swarm (empty NodeID) the call is a no-op; +// this is the expected state after a forced swarm dissolution. func DrainNode(lead *mkeconfig.Host, h *mkeconfig.Host) error { nodeID, err := swarm.NodeID(h) if err != nil { return fmt.Errorf("failed to get node ID for %s: %w", h, err) } - drainCmd := lead.Configurer.DockerCommandf("node update --availability drain %s", nodeID) - if err := lead.Exec(drainCmd); err != nil { - return fmt.Errorf("%s: failed to run MKE uninstaller: %w", lead, err) + if nodeID == "" { + log.Debugf("%s: not part of a swarm, skipping drain", h) + return nil } + + drainCmd := lead.Configurer.DockerCommandf("node update --availability drain %s", nodeID) if err := lead.Exec(drainCmd); err != nil { - return fmt.Errorf("failed to drain node %s: %w", nodeID, err) + return fmt.Errorf("%s: failed to drain node %s: %w", lead, nodeID, err) } log.Infof("%s: node %s drained", lead, nodeID) diff --git a/pkg/product/mke/phase/uninstall_mke.go b/pkg/product/mke/phase/uninstall_mke.go index 314139b4..3ffba81c 100644 --- a/pkg/product/mke/phase/uninstall_mke.go +++ b/pkg/product/mke/phase/uninstall_mke.go @@ -2,6 +2,7 @@ package phase import ( "fmt" + "strings" "github.com/Mirantis/launchpad/pkg/mke" "github.com/Mirantis/launchpad/pkg/phase" @@ -33,16 +34,36 @@ func (p *UninstallMKE) Run() error { uninstallFlags := commonconfig.Flags{"--id", swarm.ClusterID(leader), "--purge-config"} - if _, err := mke.Bootstrap("uninstall-ucp", *p.Config, mke.BootstrapOptions{OperationFlags: uninstallFlags, ExecOptions: []exec.Option{exec.StreamOutput()}}); err != nil { - return fmt.Errorf("%s: failed to run MKE uninstaller: %w", leader, err) + // Capture both output and error: the timeout message ("Uninstalling UCP + // took too long") is emitted at error level by MKE and appears only in + // the streamed output, not in the returned error (which only aggregates + // fatal-level log lines from the bootstrapper). + output, err := mke.Bootstrap("uninstall-ucp", *p.Config, mke.BootstrapOptions{OperationFlags: uninstallFlags, ExecOptions: []exec.Option{exec.StreamOutput()}}) + if err != nil { + // The uninstall-ucp bootstrapper deploys ucp-uninstall-agent as a global + // Swarm service and waits (hardcoded ~2 minutes) for every node to report + // back. On large clusters or hosts with cold image caches this deadline is + // missed. When that happens, MKE itself recommends: + // 1. Remove the stuck ucp-uninstall-agent service. + // 2. Force every node to leave the swarm. + // We implement that as an automatic fallback so that reset can continue + // to MCR uninstall without leaving a broken cluster behind. + if isUninstallTimeout(output) { + log.Warnf("%s: uninstall-ucp timed out waiting for nodes; falling back to forced swarm dissolution", leader) + if dissolveErr := dissolveSwarm(leader, p.Config.Spec.Hosts); dissolveErr != nil { + return fmt.Errorf("%s: uninstall-ucp timed out and forced swarm dissolution failed: %w (original: %w)", leader, dissolveErr, err) + } + log.Infof("%s: swarm dissolved; continuing with MCR uninstall", leader) + } else { + return fmt.Errorf("%s: failed to run MKE uninstaller: %w", leader, err) + } } managers := p.Config.Spec.Managers() _ = managers.ParallelEach(func(h *mkeconfig.Host) error { log.Infof("%s: removing ucp-controller-server-certs volume", h) - err := h.Exec(h.Configurer.DockerCommandf("volume rm --force ucp-controller-server-certs")) - if err != nil { - log.Errorf("%s: failed to remove the volume", h) + if err := h.Exec(h.Configurer.DockerCommandf("volume rm --force ucp-controller-server-certs")); err != nil { + log.Errorf("%s: failed to remove the volume: %v", h, err) } if err := h.Reboot(); err != nil { @@ -61,3 +82,50 @@ func (p *UninstallMKE) Run() error { return nil } + +// isUninstallTimeout returns true when the streamed output from the +// uninstall-ucp bootstrapper contains the well-known node-acknowledgement +// timeout message. MKE emits this at error level (not fatal), so it appears +// only in Bootstrap's output string, not in the returned error value. +func isUninstallTimeout(output string) bool { + return strings.Contains(output, "Uninstalling UCP took too long") +} + +// dissolveSwarm forcibly tears down the Swarm cluster when uninstall-ucp +// cannot do so cleanly. It follows the recovery steps documented by MKE: +// +// 1. Remove the stuck ucp-uninstall-agent / ucp-uninstall-agent-win services +// from the swarm leader (best-effort; they may already be gone). +// 2. Force all non-leader nodes to leave the swarm in parallel. +// 3. Force the leader to leave last. +// +// Errors from individual nodes are logged as warnings so that a single +// unresponsive host does not prevent the rest of the cluster from being torn +// down. Only the leader's final leave is treated as a hard failure. +func dissolveSwarm(leader *mkeconfig.Host, hosts mkeconfig.Hosts) error { + // Step 1: remove the stuck uninstall-agent services (best-effort). + for _, svc := range []string{"ucp-uninstall-agent", "ucp-uninstall-agent-win"} { + log.Infof("%s: removing stuck service %s", leader, svc) + if err := leader.Exec(leader.Configurer.DockerCommandf("service rm %s", svc)); err != nil { + log.Debugf("%s: service rm %s: %v (may already be removed)", leader, svc, err) + } + } + + // Step 2: force all non-leader nodes to leave the swarm. + nonLeaders := hosts.Filter(func(h *mkeconfig.Host) bool { return h != leader }) + _ = nonLeaders.ParallelEach(func(h *mkeconfig.Host) error { + log.Infof("%s: force-leaving swarm", h) + if err := h.Exec(h.Configurer.DockerCommandf("swarm leave --force")); err != nil { + log.Warnf("%s: swarm leave --force failed: %v", h, err) + } + return nil // continue regardless; errors are warnings only + }) + + // Step 3: leader leaves last so it can still reach the other nodes above. + log.Infof("%s: force-leaving swarm (leader)", leader) + if err := leader.Exec(leader.Configurer.DockerCommandf("swarm leave --force")); err != nil { + return fmt.Errorf("swarm leader failed to leave: %w", err) + } + + return nil +} diff --git a/pkg/product/mke/phase/uninstall_mke_test.go b/pkg/product/mke/phase/uninstall_mke_test.go new file mode 100644 index 00000000..f2e69723 --- /dev/null +++ b/pkg/product/mke/phase/uninstall_mke_test.go @@ -0,0 +1,30 @@ +package phase + +import ( + "testing" +) + +func TestIsUninstallTimeout(t *testing.T) { + t.Run("matches MKE timeout output", func(t *testing.T) { + // MKE emits this at error level; it appears in Bootstrap's output string. + output := "Uninstalling UCP took too long!\nThe following nodes are unable to uninstall within the timeout: abc123\n" + if !isUninstallTimeout(output) { + t.Errorf("expected isUninstallTimeout=true for MKE timeout output, got false") + } + }) + + t.Run("does not match generic uninstall failure output", func(t *testing.T) { + // "unable to cleanly uninstall UCP" is the fatal line — it should NOT + // trigger dissolution on its own; it can appear for non-timeout reasons. + output := "unable to cleanly uninstall UCP\n" + if isUninstallTimeout(output) { + t.Errorf("expected isUninstallTimeout=false for generic failure output, got true") + } + }) + + t.Run("does not match empty output", func(t *testing.T) { + if isUninstallTimeout("") { + t.Errorf("expected isUninstallTimeout=false for empty output, got true") + } + }) +}