diff --git a/controllers/state_manager.go b/controllers/state_manager.go
index 45b890df2..d4359361f 100644
--- a/controllers/state_manager.go
+++ b/controllers/state_manager.go
@@ -71,6 +71,7 @@ const (
 	gpuWorkloadConfigContainer         = "container"
 	gpuWorkloadConfigVMPassthrough     = "vm-passthrough"
 	gpuWorkloadConfigVMVgpu            = "vm-vgpu"
+	driverDeployLabelKey               = "nvidia.com/gpu.deploy.driver"
 	kubevirtDevicePluginDeployLabelKey = "nvidia.com/gpu.deploy.sandbox-device-plugin"
 	kataDevicePluginDeployLabelKey     = "nvidia.com/gpu.deploy.kata-sandbox-device-plugin"
 	podSecurityLabelPrefix             = "pod-security.kubernetes.io/"
@@ -395,10 +396,20 @@ func removeAllGPUStateLabels(labels map[string]string) bool {
 // updateGPUStateLabels returns true if the input labels map is modified.
 func (w *gpuWorkloadConfiguration) updateGPUStateLabels(labels map[string]string) bool {
 	if hasOperandsDisabled(labels) {
-		// Operands are disabled, delete all GPU state labels
+		// Operands are disabled: remove all GPU state labels except the driver label.
 		w.log.Info("Operands are disabled for node", "NodeName", w.node, "Label", commonOperandsLabelKey, "Value", "false")
-		w.log.Info("Disabling all operands for node", "NodeName", w.node)
-		return removeAllGPUStateLabels(labels)
+		w.log.Info("Disabling all operands for node (except the GPU driver)", "NodeName", w.node)
+		// Take the driver label out before the bulk removal so that deleting and
+		// then restoring it is not reported as a modification of the labels map.
+		// NOTE(review): assumes removeAllGPUStateLabels returns true whenever it
+		// deletes a key, the driver label included - confirm against its body.
+		driverLabelValue, hasDriverLabel := labels[driverDeployLabelKey]
+		delete(labels, driverDeployLabelKey)
+		modified := removeAllGPUStateLabels(labels)
+		if hasDriverLabel {
+			labels[driverDeployLabelKey] = driverLabelValue
+		}
+		return modified
 	}
 	removed := w.removeGPUStateLabels(labels)
 	added := w.addGPUStateLabels(labels)
diff --git a/controllers/state_manager_test.go b/controllers/state_manager_test.go
index 6585de196..0858ed142 100644
--- a/controllers/state_manager_test.go
+++ b/controllers/state_manager_test.go
@@ -21,6 +21,7 @@ import (
 	"errors"
 	"testing"
 
+	"github.com/go-logr/logr"
 	"github.com/stretchr/testify/require"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -207,6 +208,60 @@ func TestHasOperandsDisabled(t *testing.T) {
 	}
 }
 
+func TestUpdateGPUStateLabels_OperandsDisabled(t *testing.T) {
+	log := logr.Discard()
+	w := &gpuWorkloadConfiguration{config: gpuWorkloadConfigContainer, node: "test-node", log: log}
+
+	t.Run("preserves driver label when operands disabled", func(t *testing.T) {
+		labels := map[string]string{
+			commonOperandsLabelKey:                        "false",
+			driverDeployLabelKey:                          "true",
+			"nvidia.com/gpu.deploy.container-toolkit":     "true",
+			"nvidia.com/gpu.deploy.device-plugin":         "true",
+			"nvidia.com/gpu.deploy.gpu-feature-discovery": "true",
+		}
+		modified := w.updateGPUStateLabels(labels)
+		require.True(t, modified)
+		require.Equal(t, "true", labels[driverDeployLabelKey], "driver label must be preserved when operands disabled")
+		require.NotContains(t, labels, "nvidia.com/gpu.deploy.container-toolkit")
+		require.NotContains(t, labels, "nvidia.com/gpu.deploy.device-plugin")
+		require.NotContains(t, labels, "nvidia.com/gpu.deploy.gpu-feature-discovery")
+	})
+
+	t.Run("driver label not added if absent when operands disabled", func(t *testing.T) {
+		labels := map[string]string{
+			commonOperandsLabelKey:                    "false",
+			"nvidia.com/gpu.deploy.container-toolkit": "true",
+		}
+		modified := w.updateGPUStateLabels(labels)
+		require.True(t, modified)
+		require.NotContains(t, labels, driverDeployLabelKey, "driver label must not be added if it was not present")
+		require.NotContains(t, labels, "nvidia.com/gpu.deploy.container-toolkit")
+	})
+
+	t.Run("preserves driver label set to false when operands disabled", func(t *testing.T) {
+		labels := map[string]string{
+			commonOperandsLabelKey:                    "false",
+			driverDeployLabelKey:                      "false",
+			"nvidia.com/gpu.deploy.container-toolkit": "true",
+		}
+		modified := w.updateGPUStateLabels(labels)
+		require.True(t, modified)
+		require.Equal(t, "false", labels[driverDeployLabelKey], "explicitly-disabled driver label must be preserved")
+		require.NotContains(t, labels, "nvidia.com/gpu.deploy.container-toolkit")
+	})
+
+	t.Run("reports no modification when only the driver label is present", func(t *testing.T) {
+		labels := map[string]string{
+			commonOperandsLabelKey: "false",
+			driverDeployLabelKey:   "true",
+		}
+		modified := w.updateGPUStateLabels(labels)
+		require.False(t, modified, "preserving the driver label must not count as a modification")
+		require.Equal(t, "true", labels[driverDeployLabelKey], "driver label must be preserved when operands disabled")
+	})
+}
+
 func TestHasNFDLabels(t *testing.T) {
 	tests := []struct {
 		labels map[string]string
diff --git a/tests/scripts/verify-disable-operands.sh b/tests/scripts/verify-disable-operands.sh
index 66c5bb85c..c0aad843c 100755
--- a/tests/scripts/verify-disable-operands.sh
+++ b/tests/scripts/verify-disable-operands.sh
@@ -11,8 +11,7 @@ source ${SCRIPT_DIR}/.definitions.sh
 # Import the check definitions
 source ${SCRIPT_DIR}/checks.sh
 
-# We verify that all GPU Operator operands have been deleted
-check_pod_deleted "nvidia-driver-daemonset"
+# We verify that all GPU Operator operands have been deleted, except the driver.
 check_pod_deleted "nvidia-container-toolkit-daemonset"
 check_pod_deleted "nvidia-device-plugin-daemonset"
 check_pod_deleted "nvidia-dcgm-exporter"