From 0bbf88132d1ebde2744263602637be30724b55a3 Mon Sep 17 00:00:00 2001 From: Rajath Agasthya Date: Wed, 10 Jun 2026 13:00:56 -0500 Subject: [PATCH] Restart driver pods in place when driver config is unchanged A patch chart upgrade can change only cosmetic pod-template metadata (e.g. the helm.sh/chart label) without changing the driver itself. The upgrade controller keys on the DaemonSet's controller revision hash, so such a change still evicts running GPU workloads and drains the node -- for no driver benefit. Register a RestartOnlyPredicate on the upgrade state manager that compares DRIVER_CONFIG_DIGEST -- a hash of the install-relevant driver config, already set on the driver pod template -- between the running pod and the desired DaemonSet. When the digests match, the node is cordoned and the driver pod restarted in place, with no workload eviction or drain; the driver fast-path keeps the kernel modules loaded across the restart, so running GPU workloads are not disrupted. Cordoning keeps the node unschedulable if the restart fails, and the node is uncordoned on success. A missing or differing digest falls back to the full upgrade flow. The digest env name and a reader for it live in internal/config beside the digest definition; the restart-only routing decision lives in internal/predicates and is registered on the upgrade state manager in main.go. The RestartOnlyPredicate hook it relies on is provided by k8s-operator-libs, vendored here at the merged version. Signed-off-by: Rajath Agasthya --- cmd/gpu-operator/main.go | 6 +- controllers/object_controls.go | 6 +- go.mod | 2 +- go.sum | 4 +- internal/config/driver_config_digest.go | 32 ++++++++ internal/config/driver_config_digest_test.go | 78 +++++++++++++++++++ internal/predicates/restart_only.go | 50 ++++++++++++ internal/predicates/restart_only_test.go | 59 ++++++++++++++ .../pkg/upgrade/common_manager.go | 13 ++++ .../pkg/upgrade/upgrade_inplace.go | 76 ++++++++++++++++-- .../pkg/upgrade/upgrade_state.go | 11 +++ vendor/modules.txt | 2 +- 12 files changed, 326 insertions(+), 13 deletions(-) create mode 100644 internal/predicates/restart_only.go create mode 100644 internal/predicates/restart_only_test.go diff --git a/cmd/gpu-operator/main.go b/cmd/gpu-operator/main.go index 9ac5df1072..c7517e6fc8 100644 --- a/cmd/gpu-operator/main.go +++ b/cmd/gpu-operator/main.go @@ -51,6 +51,7 @@ import ( "github.com/NVIDIA/gpu-operator/controllers/clusterinfo" "github.com/NVIDIA/gpu-operator/internal/consts" "github.com/NVIDIA/gpu-operator/internal/info" + "github.com/NVIDIA/gpu-operator/internal/predicates" // +kubebuilder:scaffold:imports ) @@ -184,7 +185,10 @@ func main() { setupLog.Error(err, "unable to create new ClusterUpdateStateManager", "controller", "Upgrade") os.Exit(1) } - clusterUpgradeStateManager = clusterUpgradeStateManager.WithPodDeletionEnabled(gpuPodSpecFilter).WithValidationEnabled("app=nvidia-operator-validator") + clusterUpgradeStateManager = clusterUpgradeStateManager. + WithPodDeletionEnabled(gpuPodSpecFilter). + WithValidationEnabled("app=nvidia-operator-validator"). + WithRestartOnlyPredicate(predicates.DriverPodRestartOnly(upgradeLogger)) if err = (&controllers.UpgradeReconciler{ Client: mgr.GetClient(), diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 299da2a45b..aa6a54930b 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -1064,19 +1064,19 @@ func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n C // Set the computed digest in driver-manager initContainer driverManagerContainer := findContainerByName(obj.Spec.Template.Spec.InitContainers, "k8s-driver-manager") if driverManagerContainer != nil { - setContainerEnv(driverManagerContainer, "DRIVER_CONFIG_DIGEST", configDigest) + setContainerEnv(driverManagerContainer, driverconfig.DriverConfigDigestEnvName, configDigest) } // Set the computed digest in nvidia-driver container driverContainer := findContainerByName(obj.Spec.Template.Spec.Containers, "nvidia-driver-ctr") if driverContainer != nil { - setContainerEnv(driverContainer, "DRIVER_CONFIG_DIGEST", configDigest) + setContainerEnv(driverContainer, driverconfig.DriverConfigDigestEnvName, configDigest) } // Used by dtk-build-driver to determine if fast path should be used (skip rebuild) driverToolkitContainer := findContainerByName(obj.Spec.Template.Spec.Containers, "openshift-driver-toolkit-ctr") if driverToolkitContainer != nil { - setContainerEnv(driverToolkitContainer, "DRIVER_CONFIG_DIGEST", configDigest) + setContainerEnv(driverToolkitContainer, driverconfig.DriverConfigDigestEnvName, configDigest) } // set hostNetwork for driver if specified diff --git a/go.mod b/go.mod index 5c1a582a3a..fd02e5230d 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/Masterminds/sprig/v3 v3.3.0 github.com/NVIDIA/go-nvlib v0.11.0 github.com/NVIDIA/k8s-kata-manager v0.2.3 - github.com/NVIDIA/k8s-operator-libs v0.0.0-20260505175649-fa6a3643c441 + github.com/NVIDIA/k8s-operator-libs v0.0.0-20260629200812-d720f2557494 github.com/NVIDIA/nvidia-container-toolkit v1.19.1 github.com/cyphar/filepath-securejoin v0.7.0 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc diff --git a/go.sum b/go.sum index 40fb51e6e6..147e583551 100644 --- a/go.sum +++ b/go.sum @@ -18,8 +18,8 @@ github.com/NVIDIA/go-nvlib v0.11.0 h1:J6c9deWGJ1x4yY7fKg+aOdm2v5+WmCIeCLsuaO3tRt github.com/NVIDIA/go-nvlib v0.11.0/go.mod h1:uQNH63NoDuSfn/1lixD1D1Hvhko/xdnBHmc4H1mFUlY= github.com/NVIDIA/k8s-kata-manager v0.2.3 h1:d5+gRFqU5el/fKMXhHUaPY7haj+dbHL4nDsO/q05LBo= github.com/NVIDIA/k8s-kata-manager v0.2.3/go.mod h1:xx5OUiMsHyKbyX0JjKHqAftvqS8vx00LFn/5EaMdtB4= -github.com/NVIDIA/k8s-operator-libs v0.0.0-20260505175649-fa6a3643c441 h1:U+1f77CBKtvJEL/wzze5mY2+Y3XQ5ZgRK0R2Ru2phz4= -github.com/NVIDIA/k8s-operator-libs v0.0.0-20260505175649-fa6a3643c441/go.mod h1:L+aiCiTKN63AX9SWz/F8pv9Jw9FIfI+dAEr7VA+KowE= +github.com/NVIDIA/k8s-operator-libs v0.0.0-20260629200812-d720f2557494 h1:j+tWK79l9AouBulQps7rxILLhy2fWYcEhH4zgYjth/o= +github.com/NVIDIA/k8s-operator-libs v0.0.0-20260629200812-d720f2557494/go.mod h1:L+aiCiTKN63AX9SWz/F8pv9Jw9FIfI+dAEr7VA+KowE= github.com/NVIDIA/nvidia-container-toolkit v1.19.1 h1:1sV4ddFrBccqL9Lbzcdu50w2j5FhyNJpN5hXTfCsjps= github.com/NVIDIA/nvidia-container-toolkit v1.19.1/go.mod h1:yGsZ4s2lMjfE4r8/DMUPVpaFhRGkWvo2H++/Dy84nVc= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= diff --git a/internal/config/driver_config_digest.go b/internal/config/driver_config_digest.go index 565b14ed45..526d76a846 100644 --- a/internal/config/driver_config_digest.go +++ b/internal/config/driver_config_digest.go @@ -22,6 +22,38 @@ import ( corev1 "k8s.io/api/core/v1" ) +// DriverConfigDigestEnvName is the env var the operator sets on the driver pod +// template, carrying a hash of the install-relevant driver config (DriverInstallState). +const DriverConfigDigestEnvName = "DRIVER_CONFIG_DIGEST" + +// DriverConfigDigestFromPodSpec returns the DRIVER_CONFIG_DIGEST value from a driver +// pod spec, or "" if absent. The env is set identically on every driver container, so +// the first non-empty value (init containers first) is returned. +func DriverConfigDigestFromPodSpec(spec *corev1.PodSpec) string { + if spec == nil { + return "" + } + digestFromEnv := func(env []corev1.EnvVar) string { + for _, e := range env { + if e.Name == DriverConfigDigestEnvName { + return e.Value + } + } + return "" + } + for _, initCtr := range spec.InitContainers { + if v := digestFromEnv(initCtr.Env); v != "" { + return v + } + } + for _, ctr := range spec.Containers { + if v := digestFromEnv(ctr.Env); v != "" { + return v + } + } + return "" +} + // DriverInstallState lists all fields that affect driver installation. // Changes to these fields trigger a driver reinstall. // diff --git a/internal/config/driver_config_digest_test.go b/internal/config/driver_config_digest_test.go index b9adae2e41..10da9cd0a4 100644 --- a/internal/config/driver_config_digest_test.go +++ b/internal/config/driver_config_digest_test.go @@ -309,3 +309,81 @@ func TestExtractVolumes(t *testing.T) { }) } } + +// containerWithConfigDigest builds a container carrying the DRIVER_CONFIG_DIGEST env +// when digest is non-empty (matching how object_controls.go sets it). +func containerWithConfigDigest(name, digest string) corev1.Container { + c := corev1.Container{Name: name} + if digest != "" { + c.Env = []corev1.EnvVar{{Name: DriverConfigDigestEnvName, Value: digest}} + } + return c +} + +func TestDriverConfigDigestFromPodSpec(t *testing.T) { + tests := []struct { + name string + spec *corev1.PodSpec + want string + }{ + { + name: "digest on k8s-driver-manager init container", + spec: &corev1.PodSpec{ + InitContainers: []corev1.Container{containerWithConfigDigest("k8s-driver-manager", "abc123")}, + Containers: []corev1.Container{containerWithConfigDigest("nvidia-driver-ctr", "")}, + }, + want: "abc123", + }, + { + name: "digest on nvidia-driver-ctr main container", + spec: &corev1.PodSpec{ + Containers: []corev1.Container{containerWithConfigDigest("nvidia-driver-ctr", "def456")}, + }, + want: "def456", + }, + { + name: "digest on OCP openshift-driver-toolkit-ctr", + spec: &corev1.PodSpec{ + Containers: []corev1.Container{containerWithConfigDigest("openshift-driver-toolkit-ctr", "ocp789")}, + }, + want: "ocp789", + }, + { + name: "init container digest takes precedence over main container", + spec: &corev1.PodSpec{ + InitContainers: []corev1.Container{containerWithConfigDigest("k8s-driver-manager", "init-digest")}, + Containers: []corev1.Container{containerWithConfigDigest("nvidia-driver-ctr", "main-digest")}, + }, + want: "init-digest", + }, + { + name: "empty init digest is skipped; main container value used", + spec: &corev1.PodSpec{ + InitContainers: []corev1.Container{{ + Name: "k8s-driver-manager", + Env: []corev1.EnvVar{{Name: DriverConfigDigestEnvName, Value: ""}}, + }}, + Containers: []corev1.Container{containerWithConfigDigest("nvidia-driver-ctr", "main-digest")}, + }, + want: "main-digest", + }, + { + name: "no digest anywhere", + spec: &corev1.PodSpec{ + InitContainers: []corev1.Container{{Name: "k8s-driver-manager"}}, + Containers: []corev1.Container{{Name: "nvidia-driver-ctr"}}, + }, + want: "", + }, + { + name: "nil spec", + spec: nil, + want: "", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, DriverConfigDigestFromPodSpec(tt.spec)) + }) + } +} diff --git a/internal/predicates/restart_only.go b/internal/predicates/restart_only.go new file mode 100644 index 0000000000..f1cf2b3b4e --- /dev/null +++ b/internal/predicates/restart_only.go @@ -0,0 +1,50 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +// Package predicates holds predicates the upgrade controller registers on the +// k8s-operator-libs upgrade state manager. +package predicates + +import ( + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + + "github.com/NVIDIA/k8s-operator-libs/pkg/consts" + "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" + + driverconfig "github.com/NVIDIA/gpu-operator/internal/config" +) + +// DriverPodRestartOnly returns the upgrade controller's RestartOnlyPredicate: it allows an +// out-of-sync driver pod to be restarted in place when the running pod spec and the desired +// DaemonSet template spec have the same DRIVER_CONFIG_DIGEST, i.e. the install-relevant +// config is unchanged (e.g. only a helm.sh/chart label changed). If either digest is missing, +// it returns false and the node takes the full upgrade flow. +func DriverPodRestartOnly(log logr.Logger) upgrade.RestartOnlyPredicate { + return func(running, desired *corev1.PodSpec) (bool, error) { + desiredDigest := driverconfig.DriverConfigDigestFromPodSpec(desired) + runningDigest := driverconfig.DriverConfigDigestFromPodSpec(running) + if desiredDigest == "" || runningDigest == "" { + log.V(consts.LogLevelDebug).Info("driver config digest missing; taking full upgrade flow", + "desiredDigest", desiredDigest, "runningDigest", runningDigest) + return false, nil + } + restartOnly := desiredDigest == runningDigest + log.V(consts.LogLevelDebug).Info("evaluated driver config digest for restart-only routing", + "desiredDigest", desiredDigest, "runningDigest", runningDigest, "restartOnly", restartOnly) + return restartOnly, nil + } +} diff --git a/internal/predicates/restart_only_test.go b/internal/predicates/restart_only_test.go new file mode 100644 index 0000000000..33d12243c7 --- /dev/null +++ b/internal/predicates/restart_only_test.go @@ -0,0 +1,59 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package predicates + +import ( + "testing" + + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + + driverconfig "github.com/NVIDIA/gpu-operator/internal/config" +) + +func TestDriverPodRestartOnly(t *testing.T) { + podSpec := func(digest string) *corev1.PodSpec { + return &corev1.PodSpec{Containers: []corev1.Container{{ + Name: "nvidia-driver-ctr", + Env: []corev1.EnvVar{{Name: driverconfig.DriverConfigDigestEnvName, Value: digest}}, + }}} + } + + predicate := DriverPodRestartOnly(logr.Discard()) + + tests := []struct { + name string + running *corev1.PodSpec + desired *corev1.PodSpec + wantRestart bool + }{ + {name: "equal digests -> restart-only", running: podSpec("same"), desired: podSpec("same"), wantRestart: true}, + {name: "differing digests -> full upgrade", running: podSpec("old"), desired: podSpec("new"), wantRestart: false}, + {name: "missing digest on running pod -> full upgrade", running: podSpec(""), desired: podSpec("new"), wantRestart: false}, + {name: "missing digest on desired template -> full upgrade", running: podSpec("old"), desired: podSpec(""), wantRestart: false}, + {name: "nil running spec -> full upgrade", running: nil, desired: podSpec("x"), wantRestart: false}, + {name: "nil desired spec -> full upgrade", running: podSpec("x"), desired: nil, wantRestart: false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := predicate(tt.running, tt.desired) + assert.NoError(t, err) + assert.Equal(t, tt.wantRestart, got) + }) + } +} diff --git a/vendor/github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common_manager.go b/vendor/github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common_manager.go index 52b47b31c0..2a16d979e2 100644 --- a/vendor/github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common_manager.go +++ b/vendor/github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/common_manager.go @@ -79,6 +79,17 @@ func NewClusterUpgradeState() ClusterUpgradeState { return ClusterUpgradeState{NodeStates: make(map[string][]*NodeUpgradeState)} } +// RestartOnlyPredicate is used for a node whose driver pod is out-of-sync with its +// DaemonSet. running is the live driver pod's spec; desired is the DaemonSet template's +// pod spec. Returning true means the difference does not affect the installed driver, so +// the node is cordoned and the driver pod restarted in place, skipping pod-deletion +// (workload eviction) and drain; the consumer guarantees the running driver does not need +// to change across the restart. Returning false (the default when unset) routes the node +// through the full upgrade flow. Returning an error keeps the node in upgrade-required to +// be retried on a later reconcile. It is never called for orphaned pods, upgrade-requested +// nodes, or nodes waiting for safe driver load. +type RestartOnlyPredicate func(running, desired *corev1.PodSpec) (bool, error) + // CommonUpgradeManagerImpl is an implementation of the CommonUpgradeStateManager interface. // It facilitates common logic implementation for both upgrade modes: in-place and requestor (e.g. maintenance OP). type CommonUpgradeManagerImpl struct { @@ -97,6 +108,8 @@ type CommonUpgradeManagerImpl struct { // optional states podDeletionStateEnabled bool validationStateEnabled bool + // optional: when set, route immaterial pod-template changes to a restart-only path + restartOnlyPredicate RestartOnlyPredicate } // NewCommonUpgradeStateManager creates a new instance of CommonUpgradeManagerImpl diff --git a/vendor/github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/upgrade_inplace.go b/vendor/github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/upgrade_inplace.go index 02fccd3659..d11f0dc4b1 100644 --- a/vendor/github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/upgrade_inplace.go +++ b/vendor/github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/upgrade_inplace.go @@ -18,7 +18,9 @@ package upgrade import ( "context" + "fmt" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/intstr" "github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1" @@ -69,7 +71,8 @@ func (m *InplaceNodeStateManagerImpl) ProcessUpgradeRequiredNodes( "maximum nodes that can be unavailable", maxUnavailable) for _, nodeState := range currentClusterState.NodeStates[UpgradeStateUpgradeRequired] { - if m.IsUpgradeRequested(nodeState.Node) { + upgradeRequested := m.IsUpgradeRequested(nodeState.Node) + if upgradeRequested { // Make sure to remove the upgrade-requested annotation err := m.NodeUpgradeStateProvider.ChangeNodeUpgradeAnnotation(ctx, nodeState.Node, GetUpgradeRequestedAnnotationKey(), "null") @@ -96,14 +99,26 @@ func (m *InplaceNodeStateManagerImpl) ProcessUpgradeRequiredNodes( } } - err := m.NodeUpgradeStateProvider.ChangeNodeUpgradeState(ctx, nodeState.Node, UpgradeStateCordonRequired) + targetState, terr := m.nextStateForUpgradeRequiredNode(ctx, nodeState, upgradeRequested) + if terr != nil { + // Keep the node in upgrade-required and retry on the next reconcile instead + // of starting a full upgrade. + m.Log.V(consts.LogLevelError).Error(terr, + "could not determine next upgrade state; node kept in upgrade-required for retry", + "node", nodeState.Node.Name) + logEventf(m.EventRecorder, nodeState.Node, corev1.EventTypeWarning, GetEventReason(), + "%v, will retry", terr) + continue + } + + err := m.NodeUpgradeStateProvider.ChangeNodeUpgradeState(ctx, nodeState.Node, targetState) if err == nil { upgradesAvailable-- - m.Log.V(consts.LogLevelInfo).Info("Node waiting for cordon", - "node", nodeState.Node.Name) + m.Log.V(consts.LogLevelInfo).Info("Node moving to next upgrade state", + "node", nodeState.Node.Name, "state", targetState) } else { m.Log.V(consts.LogLevelError).Error( - err, "Failed to change node upgrade state", "state", UpgradeStateCordonRequired) + err, "Failed to change node upgrade state", "state", targetState) return err } } @@ -111,6 +126,57 @@ func (m *InplaceNodeStateManagerImpl) ProcessUpgradeRequiredNodes( return nil } +// nextStateForUpgradeRequiredNode determines the state a node in upgrade-required moves to. +// It returns UpgradeStatePodRestartRequired when a registered restart-only predicate matches +// (after cordoning the node), and UpgradeStateCordonRequired for the full upgrade flow otherwise. +// A non-nil error means the decision could not be made; the caller keeps the node in +// upgrade-required and retries on the next reconcile. +func (m *InplaceNodeStateManagerImpl) nextStateForUpgradeRequiredNode( + ctx context.Context, nodeState *NodeUpgradeState, upgradeRequested bool) (string, error) { + restartOnly, err := m.shouldRestartOnly(ctx, nodeState, upgradeRequested) + if err != nil { + return "", err + } + if !restartOnly { + return UpgradeStateCordonRequired, nil + } + // Restart-only change: cordon the node so it stays unschedulable if the pod restart fails, as in + // the full upgrade flow, then restart the driver pod without evicting workloads. + m.Log.V(consts.LogLevelInfo).Info( + "Restart-only change detected; cordoning node and restarting driver pod in place, "+ + "skipping pod-deletion and drain", "node", nodeState.Node.Name) + if err := m.CordonManager.Cordon(ctx, nodeState.Node); err != nil { + return "", fmt.Errorf("failed to cordon node for restart-only upgrade: %w", err) + } + return UpgradeStatePodRestartRequired, nil +} + +// shouldRestartOnly reports whether the node qualifies for an in-place driver pod restart instead +// of the full upgrade flow. It is false when no predicate is registered, for orphaned pods, for +// nodes that explicitly requested an upgrade, and for nodes waiting for safe driver load (which +// must take the full flow so workloads are evicted before the load is unblocked at +// pod-restart-required). +func (m *InplaceNodeStateManagerImpl) shouldRestartOnly( + ctx context.Context, nodeState *NodeUpgradeState, upgradeRequested bool) (bool, error) { + if m.restartOnlyPredicate == nil || upgradeRequested || nodeState.IsOrphanedPod() || + nodeState.DriverPod == nil { + return false, nil + } + waitingForSafeLoad, err := m.SafeDriverLoadManager.IsWaitingForSafeDriverLoad(ctx, nodeState.Node) + if err != nil { + return false, fmt.Errorf("failed to check safe driver load status: %w", err) + } + if waitingForSafeLoad { + return false, nil + } + restartOnly, err := m.restartOnlyPredicate(&nodeState.DriverPod.Spec, + &nodeState.DriverDaemonSet.Spec.Template.Spec) + if err != nil { + return false, fmt.Errorf("failed to evaluate restart-only predicate: %w", err) + } + return restartOnly, nil +} + // ProcessNodeMaintenanceRequiredNodes is a used to satisfy ProcessNodeStateManager interface func (m *InplaceNodeStateManagerImpl) ProcessNodeMaintenanceRequiredNodes(ctx context.Context, currentClusterState *ClusterUpgradeState) error { diff --git a/vendor/github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/upgrade_state.go b/vendor/github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/upgrade_state.go index 4c1d626937..bb94207488 100644 --- a/vendor/github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/upgrade_state.go +++ b/vendor/github.com/NVIDIA/k8s-operator-libs/pkg/upgrade/upgrade_state.go @@ -40,6 +40,9 @@ type ClusterUpgradeStateManager interface { // WithValidationEnabled provides an option to enable the optional 'validation' state // and pass a podSelector to specify which pods are performing the validation WithValidationEnabled(podSelector string) ClusterUpgradeStateManager + // WithRestartOnlyPredicate registers an optional predicate (see RestartOnlyPredicate); + // a nil predicate, the default, keeps the full upgrade flow for every out-of-sync node. + WithRestartOnlyPredicate(predicate RestartOnlyPredicate) ClusterUpgradeStateManager // BuildState builds a point-in-time snapshot of the driver upgrade state in the cluster. BuildState(ctx context.Context, namespace string, driverLabels map[string]string) (*ClusterUpgradeState, error) @@ -349,6 +352,14 @@ func (m *ClusterUpgradeStateManagerImpl) WithValidationEnabled(podSelector strin return m } +// WithRestartOnlyPredicate registers an optional restart-only predicate; a nil predicate +// preserves the default full upgrade flow for every out-of-sync node. +func (m *ClusterUpgradeStateManagerImpl) WithRestartOnlyPredicate( + predicate RestartOnlyPredicate) ClusterUpgradeStateManager { + m.restartOnlyPredicate = predicate + return m +} + // buildNodeUpgradeState creates a mapping between a node, // the driver POD running on them and the daemon set, controlling this pod func (m *ClusterUpgradeStateManagerImpl) buildNodeUpgradeState( diff --git a/vendor/modules.txt b/vendor/modules.txt index cb48f77fbe..4c390f4b5b 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -36,7 +36,7 @@ github.com/NVIDIA/go-nvlib/pkg/pciids # github.com/NVIDIA/k8s-kata-manager v0.2.3 ## explicit; go 1.23.0 github.com/NVIDIA/k8s-kata-manager/api/v1alpha1/config -# github.com/NVIDIA/k8s-operator-libs v0.0.0-20260505175649-fa6a3643c441 +# github.com/NVIDIA/k8s-operator-libs v0.0.0-20260629200812-d720f2557494 ## explicit; go 1.26.0 github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1 github.com/NVIDIA/k8s-operator-libs/pkg/consts