Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions pkg/mcr/mcr.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,22 @@ var (
)

// DrainNode drains a node from the workload via docker drain command.
// If the node is not part of a swarm (empty NodeID) the call is a no-op;
// this is the expected state after a forced swarm dissolution.
func DrainNode(lead *mkeconfig.Host, h *mkeconfig.Host) error {
nodeID, err := swarm.NodeID(h)
if err != nil {
return fmt.Errorf("failed to get node ID for %s: %w", h, err)
}

drainCmd := lead.Configurer.DockerCommandf("node update --availability drain %s", nodeID)
if err := lead.Exec(drainCmd); err != nil {
return fmt.Errorf("%s: failed to run MKE uninstaller: %w", lead, err)
if nodeID == "" {
log.Debugf("%s: not part of a swarm, skipping drain", h)
return nil
}

drainCmd := lead.Configurer.DockerCommandf("node update --availability drain %s", nodeID)
if err := lead.Exec(drainCmd); err != nil {
return fmt.Errorf("failed to drain node %s: %w", nodeID, err)
return fmt.Errorf("%s: failed to drain node %s: %w", lead, nodeID, err)
}

log.Infof("%s: node %s drained", lead, nodeID)
Expand Down
78 changes: 73 additions & 5 deletions pkg/product/mke/phase/uninstall_mke.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package phase

import (
"fmt"
"strings"

"github.com/Mirantis/launchpad/pkg/mke"
"github.com/Mirantis/launchpad/pkg/phase"
Expand Down Expand Up @@ -33,16 +34,36 @@ func (p *UninstallMKE) Run() error {

uninstallFlags := commonconfig.Flags{"--id", swarm.ClusterID(leader), "--purge-config"}

if _, err := mke.Bootstrap("uninstall-ucp", *p.Config, mke.BootstrapOptions{OperationFlags: uninstallFlags, ExecOptions: []exec.Option{exec.StreamOutput()}}); err != nil {
return fmt.Errorf("%s: failed to run MKE uninstaller: %w", leader, err)
// Capture both output and error: the timeout message ("Uninstalling UCP
// took too long") is emitted at error level by MKE and appears only in
// the streamed output, not in the returned error (which only aggregates
// fatal-level log lines from the bootstrapper).
output, err := mke.Bootstrap("uninstall-ucp", *p.Config, mke.BootstrapOptions{OperationFlags: uninstallFlags, ExecOptions: []exec.Option{exec.StreamOutput()}})
if err != nil {
// The uninstall-ucp bootstrapper deploys ucp-uninstall-agent as a global
// Swarm service and waits (hardcoded ~2 minutes) for every node to report
// back. On large clusters or hosts with cold image caches this deadline is
// missed. When that happens, MKE itself recommends:
// 1. Remove the stuck ucp-uninstall-agent service.
// 2. Force every node to leave the swarm.
// We implement that as an automatic fallback so that reset can continue
// to MCR uninstall without leaving a broken cluster behind.
if isUninstallTimeout(output) {
log.Warnf("%s: uninstall-ucp timed out waiting for nodes; falling back to forced swarm dissolution", leader)
if dissolveErr := dissolveSwarm(leader, p.Config.Spec.Hosts); dissolveErr != nil {
return fmt.Errorf("%s: uninstall-ucp timed out and forced swarm dissolution failed: %w (original: %w)", leader, dissolveErr, err)
}
log.Infof("%s: swarm dissolved; continuing with MCR uninstall", leader)
} else {
return fmt.Errorf("%s: failed to run MKE uninstaller: %w", leader, err)
}
}

managers := p.Config.Spec.Managers()
_ = managers.ParallelEach(func(h *mkeconfig.Host) error {
log.Infof("%s: removing ucp-controller-server-certs volume", h)
err := h.Exec(h.Configurer.DockerCommandf("volume rm --force ucp-controller-server-certs"))
if err != nil {
log.Errorf("%s: failed to remove the volume", h)
if err := h.Exec(h.Configurer.DockerCommandf("volume rm --force ucp-controller-server-certs")); err != nil {
log.Errorf("%s: failed to remove the volume: %v", h, err)
}

if err := h.Reboot(); err != nil {
Expand All @@ -61,3 +82,50 @@ func (p *UninstallMKE) Run() error {

return nil
}

// isUninstallTimeout reports whether output contains the marker text that the
// uninstall-ucp bootstrapper prints when it gives up waiting for nodes to
// acknowledge the uninstall. The check is a plain, case-sensitive substring
// match, so the marker may appear anywhere in the output.
//
// NOTE(review): callers pass Bootstrap's output string rather than its error
// because the message is reportedly logged at error (not fatal) level and is
// therefore absent from the returned error — confirm against mke.Bootstrap.
func isUninstallTimeout(output string) bool {
	// Marker emitted on the node-acknowledgement timeout.
	const timeoutMarker = "Uninstalling UCP took too long"

	return strings.Contains(output, timeoutMarker)
}

// dissolveSwarm is the forced fallback used when uninstall-ucp cannot tear the
// Swarm cluster down cleanly. It performs three steps, in order:
//
//  1. On the leader, issue a "service rm" for ucp-uninstall-agent and for
//     ucp-uninstall-agent-win. Failures are logged at debug level only, since
//     the services may already be gone.
//  2. On every host other than the leader, issue "swarm leave --force" in
//     parallel. A failure on any host is logged as a warning and does not
//     stop the remaining hosts from being processed.
//  3. On the leader, issue "swarm leave --force".
//
// The only error returned is a failure of step 3; everything before it is
// best-effort.
func dissolveSwarm(leader *mkeconfig.Host, hosts mkeconfig.Hosts) error {
	// forceLeave runs the force-leave command on node, using that node's own
	// Configurer to build the command line.
	forceLeave := func(node *mkeconfig.Host) error {
		return node.Exec(node.Configurer.DockerCommandf("swarm leave --force"))
	}

	// Step 1: best-effort removal of the stuck uninstall-agent services.
	stuckServices := []string{"ucp-uninstall-agent", "ucp-uninstall-agent-win"}
	for _, name := range stuckServices {
		log.Infof("%s: removing stuck service %s", leader, name)
		rmErr := leader.Exec(leader.Configurer.DockerCommandf("service rm %s", name))
		if rmErr == nil {
			continue
		}
		log.Debugf("%s: service rm %s: %v (may already be removed)", leader, name, rmErr)
	}

	// Step 2: every host except the leader leaves the swarm. The aggregate
	// result is discarded on purpose: the callback always returns nil, so
	// per-host failures surface only as warnings.
	notLeader := func(node *mkeconfig.Host) bool { return node != leader }
	_ = hosts.Filter(notLeader).ParallelEach(func(node *mkeconfig.Host) error {
		log.Infof("%s: force-leaving swarm", node)
		if leaveErr := forceLeave(node); leaveErr != nil {
			log.Warnf("%s: swarm leave --force failed: %v", node, leaveErr)
		}
		return nil
	})

	// Step 3: the leader goes last, after all other hosts have been handled.
	log.Infof("%s: force-leaving swarm (leader)", leader)
	if leaveErr := forceLeave(leader); leaveErr != nil {
		return fmt.Errorf("swarm leader failed to leave: %w", leaveErr)
	}

	return nil
}
30 changes: 30 additions & 0 deletions pkg/product/mke/phase/uninstall_mke_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package phase

import (
"testing"
)

// TestIsUninstallTimeout checks isUninstallTimeout against representative
// bootstrapper outputs, one subtest per case.
func TestIsUninstallTimeout(t *testing.T) {
	cases := []struct {
		name    string // subtest name
		output  string // text handed to isUninstallTimeout
		want    bool   // expected result
		failMsg string // message reported when the result differs from want
	}{
		{
			// The timeout text as it shows up in Bootstrap's output string.
			name:    "matches MKE timeout output",
			output:  "Uninstalling UCP took too long!\nThe following nodes are unable to uninstall within the timeout: abc123\n",
			want:    true,
			failMsg: "expected isUninstallTimeout=true for MKE timeout output, got false",
		},
		{
			// "unable to cleanly uninstall UCP" can appear for non-timeout
			// reasons, so on its own it must not be treated as a timeout.
			name:    "does not match generic uninstall failure output",
			output:  "unable to cleanly uninstall UCP\n",
			want:    false,
			failMsg: "expected isUninstallTimeout=false for generic failure output, got true",
		},
		{
			name:    "does not match empty output",
			output:  "",
			want:    false,
			failMsg: "expected isUninstallTimeout=false for empty output, got true",
		},
	}

	for _, tc := range cases {
		tc := tc // per-iteration copy for toolchains older than Go 1.22
		t.Run(tc.name, func(t *testing.T) {
			if got := isUninstallTimeout(tc.output); got != tc.want {
				t.Error(tc.failMsg)
			}
		})
	}
}
Loading