diff --git a/controllers/active_config.go b/controllers/active_config.go new file mode 100644 index 000000000..7c6c783cd --- /dev/null +++ b/controllers/active_config.go @@ -0,0 +1,51 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package controllers + +import ( + "context" + "fmt" + + "sigs.k8s.io/controller-runtime/pkg/client" + + gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" +) + +// resolveActiveConfig determines which cluster-wide configuration drives the operands: a +// ClusterPolicy takes precedence, otherwise the DRA-based GPUClusterConfig. The two CRs are +// mutually exclusive, so at most one of the returned values is non-nil; both nil means +// neither CR exists. +func resolveActiveConfig(ctx context.Context, c client.Client) (*gpuv1.ClusterPolicy, *nvidiav1alpha1.GPUClusterConfig, error) { + clusterPolicies := &gpuv1.ClusterPolicyList{} + if err := c.List(ctx, clusterPolicies); err != nil { + return nil, nil, fmt.Errorf("failed to list ClusterPolicy: %w", err) + } + if len(clusterPolicies.Items) > 0 { + return &clusterPolicies.Items[0], nil, nil + } + + gpuClusterConfigs := &nvidiav1alpha1.GPUClusterConfigList{} + if err := c.List(ctx, gpuClusterConfigs); err != nil { + return nil, nil, fmt.Errorf("failed to list GPUClusterConfig: %w", err) + } + if len(gpuClusterConfigs.Items) > 0 { + return nil, &gpuClusterConfigs.Items[0], nil + } + + return nil, nil, nil +} diff --git a/controllers/gpuclusterconfig_controller.go b/controllers/gpuclusterconfig_controller.go index f7410e54d..3986060d2 100644 --- a/controllers/gpuclusterconfig_controller.go +++ b/controllers/gpuclusterconfig_controller.go @@ -34,7 +34,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" - gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" "github.com/NVIDIA/gpu-operator/controllers/clusterinfo" "github.com/NVIDIA/gpu-operator/internal/conditions" @@ -82,11 +81,11 @@ func (r *GPUClusterConfigReconciler) Reconcile(ctx context.Context, req ctrl.Req // GPUClusterConfig (DRA path) is mutually exclusive with ClusterPolicy: if one // exists, yield to it rather than deploying the DRA stack alongside it. - clusterPolicies := &gpuv1.ClusterPolicyList{} - if err := r.List(ctx, clusterPolicies); err != nil { - return ctrl.Result{}, fmt.Errorf("error listing ClusterPolicies: %w", err) + clusterPolicy, _, err := resolveActiveConfig(ctx, r.Client) + if err != nil { + return ctrl.Result{}, err } - if len(clusterPolicies.Items) > 0 { + if clusterPolicy != nil { logger.V(consts.LogLevelWarning).Info("ClusterPolicy present, skipping mutually exclusive GPUClusterConfig") if err := r.updateCrStatus(ctx, instance, nvidiav1alpha1.Disabled); err != nil { return ctrl.Result{}, err @@ -95,7 +94,10 @@ func (r *GPUClusterConfigReconciler) Reconcile(ctx context.Context, req ctrl.Req if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, msg); condErr != nil { logger.Error(condErr, "failed to set condition") } - return ctrl.Result{}, nil + // Requeue so the ClusterPolicy's deletion is noticed and the instance + // recovers; nothing watches ClusterPolicy here, mirroring the ready-path + // resync below. + return ctrl.Result{RequeueAfter: time.Minute}, nil } // Singleton, first-wins (mirroring ClusterPolicy): the first instance to reconcile diff --git a/controllers/gpuclusterconfig_controller_test.go b/controllers/gpuclusterconfig_controller_test.go index 83a3885e7..cfa1591c1 100644 --- a/controllers/gpuclusterconfig_controller_test.go +++ b/controllers/gpuclusterconfig_controller_test.go @@ -61,14 +61,17 @@ func newGPUClusterConfigReconciler(t *testing.T, objs ...client.Object) (*GPUClu } // fakeStateManager returns canned SyncState results so the controller tests don't load -// real manifests. GetWatchSources is promoted from the embedded (nil) interface and is -// never called here — only SetupWithManager calls it, which these tests skip. +// real manifests. It records the last info catalog passed to SyncState so tests can +// assert on its entries. GetWatchSources is promoted from the embedded (nil) interface +// and is never called here; only SetupWithManager calls it, which these tests skip. type fakeStateManager struct { state.Manager - results state.Results + results state.Results + lastCatalog state.InfoCatalog } -func (f *fakeStateManager) SyncState(_ context.Context, _ interface{}, _ state.InfoCatalog) state.Results { +func (f *fakeStateManager) SyncState(_ context.Context, _ interface{}, catalog state.InfoCatalog) state.Results { + f.lastCatalog = catalog return f.results } @@ -114,14 +117,22 @@ func TestGPUClusterConfigReconcileNotFound(t *testing.T) { // A ClusterPolicy in the cluster disables the GPUClusterConfig: the two paths are // mutually exclusive, so the DRA stack is not deployed alongside ClusterPolicy. +// The result requeues so the instance recovers once the ClusterPolicy is removed. func TestGPUClusterConfigDisabledByClusterPolicy(t *testing.T) { cfg := &nvidiav1alpha1.GPUClusterConfig{ObjectMeta: metav1.ObjectMeta{Name: "config"}} cp := &gpuv1.ClusterPolicy{ObjectMeta: metav1.ObjectMeta{Name: "cluster-policy"}} r, c := newGPUClusterConfigReconciler(t, cfg, cp) - gccReconcile(t, r, cfg.Name) + res, err := r.Reconcile(t.Context(), gccRequest(cfg.Name)) + require.NoError(t, err) + require.Positive(t, res.RequeueAfter, "disabled instance must requeue to detect ClusterPolicy removal") require.Equal(t, nvidiav1alpha1.Disabled, gccState(t, c, cfg.Name)) + + // Removing the ClusterPolicy lets the next reconcile recover the instance. + require.NoError(t, c.Delete(t.Context(), cp)) + gccReconcile(t, r, cfg.Name) + require.Equal(t, nvidiav1alpha1.Ready, gccState(t, c, cfg.Name)) } // First-reconciled wins (mirroring ClusterPolicy): whichever instance reconciles first diff --git a/controllers/nodelabeling_controller.go b/controllers/nodelabeling_controller.go index 4fe4641c2..28e98b2aa 100644 --- a/controllers/nodelabeling_controller.go +++ b/controllers/nodelabeling_controller.go @@ -53,12 +53,15 @@ type NodeLabelingReconciler struct { } // nodeLabelingController holds per-reconcile state so that helper methods don't need to -// re-receive that state as arguments. +// re-receive that state as arguments. Exactly one of clusterPolicy or gpuClusterConfig is +// set, selecting the ClusterPolicy stack (the default) or the DRA-based GPUClusterConfig +// stack; the two CRs are mutually exclusive. type nodeLabelingController struct { - client client.Client - namespace string - clusterPolicy *gpuv1.ClusterPolicy - logger logr.Logger + client client.Client + namespace string + clusterPolicy *gpuv1.ClusterPolicy + gpuClusterConfig *nvidiav1alpha1.GPUClusterConfig + logger logr.Logger } // +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;patch @@ -67,34 +70,34 @@ type nodeLabelingController struct { func (r *NodeLabelingReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { r.Log.Info("Reconciling node labels") - clusterPolicyList := &gpuv1.ClusterPolicyList{} - if err := r.List(ctx, clusterPolicyList); err != nil { - return reconcile.Result{}, fmt.Errorf("failed to list ClusterPolicy: %w", err) + // Default to the ClusterPolicy stack; fall back to the GPUClusterConfig (DRA) stack when + // no ClusterPolicy exists. Neither existing means there is nothing to label. + clusterPolicy, gpuClusterConfig, err := resolveActiveConfig(ctx, r.Client) + if err != nil { + return reconcile.Result{}, err } - - // (cdesiniotis) Return early if a ClusterPolicy CR does not exist. - // This means that nodes will not get labeled unless a ClusterPolicy - // CR has been created. This may be relaxed in the future when the - // NVIDIA DRA Driver for GPUs is integrated with the GPU Operator - // and new CRDs are introduced. - if len(clusterPolicyList.Items) == 0 { - r.Log.Info("No ClusterPolicy CR exists, skipping node labeling") + if clusterPolicy == nil && gpuClusterConfig == nil { + r.Log.Info("No ClusterPolicy or GPUClusterConfig CR exists, skipping node labeling") return reconcile.Result{}, nil } - clusterPolicy := &clusterPolicyList.Items[0] nlc := &nodeLabelingController{ - client: r.Client, - namespace: r.Namespace, - clusterPolicy: clusterPolicy, - logger: r.Log, + client: r.Client, + namespace: r.Namespace, + clusterPolicy: clusterPolicy, + gpuClusterConfig: gpuClusterConfig, + logger: r.Log, } if err := nlc.labelGPUNodes(ctx); err != nil { return reconcile.Result{}, err } - if nlc.clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType() { + // Route each GPU node to its NVIDIADriver CR. Skipping this leaves the NVIDIADriver controller owning no nodes, and it + // then removes the driver DaemonSet. + usesNvidiaDriverCRD := nlc.gpuClusterConfig != nil || + (nlc.clusterPolicy != nil && nlc.clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType()) + if usesNvidiaDriverCRD { if _, err := nvidiadriverutil.AssignOwners(ctx, r.Client); err != nil { return reconcile.Result{}, fmt.Errorf("failed to assign NVIDIADriver owners to nodes: %w", err) } @@ -103,8 +106,11 @@ func (r *NodeLabelingReconciler) Reconcile(ctx context.Context, req ctrl.Request } } - if err := nlc.applyDriverAutoUpgradeAnnotation(ctx); err != nil { - return reconcile.Result{}, err + // The driver auto-upgrade annotation is derived from ClusterPolicy spec. + if nlc.clusterPolicy != nil { + if err := nlc.applyDriverAutoUpgradeAnnotation(ctx); err != nil { + return reconcile.Result{}, err + } } return reconcile.Result{}, nil @@ -160,6 +166,10 @@ func (nlc *nodeLabelingController) reconcileCommonGPULabel(labels map[string]str // appropriate. If the node does not have the common GPU label, all state labels are removed. // Returns true if labels were modified. func (nlc *nodeLabelingController) updateGPUStateLabels(labels map[string]string, nodeName string) bool { + if nlc.gpuClusterConfig != nil { + return updateGPUClusterConfigStateLabels(labels) + } + if !hasCommonGPULabel(labels) { return removeAllGPUStateLabels(labels) } @@ -200,6 +210,31 @@ func (nlc *nodeLabelingController) updateGPUStateLabels(labels map[string]string return modified } +// updateGPUClusterConfigStateLabels is the GPUClusterConfig analogue of the ClusterPolicy +// gpuWorkloadConfiguration state-label logic: it sets the DRA operand deploy labels on a GPU +// node and removes them once the GPUs are gone. Like the ClusterPolicy path it honors an +// existing value (set only when absent) so the k8s-driver-manager can pause an operand by +// flipping its label to drain it off a node during a driver reload. Returns true if modified. +func updateGPUClusterConfigStateLabels(labels map[string]string) bool { + modified := false + if !hasCommonGPULabel(labels) { + for key := range gpuClusterConfigStateLabels { + if _, ok := labels[key]; ok { + delete(labels, key) + modified = true + } + } + return modified + } + for key, value := range gpuClusterConfigStateLabels { + if _, ok := labels[key]; !ok { + labels[key] = value + modified = true + } + } + return modified +} + func (nlc *nodeLabelingController) setDriverAutoUpgradeAnnotation(ctx context.Context, node *corev1.Node, autoUpgradeEnabled bool) error { annotationValue, annotationExists := node.Annotations[driverAutoUpgradeAnnotationKey] updateRequired := false @@ -383,6 +418,20 @@ func (r *NodeLabelingReconciler) SetupWithManager(ctx context.Context, mgr ctrl. return fmt.Errorf("error watching ClusterPolicy: %w", err) } + // Watch GPUClusterConfig so GPU nodes are (re)labeled for the DRA stack as the CR is + // created or removed, mirroring the ClusterPolicy watch above. + gpuClusterConfigMapFn := func(ctx context.Context, gcc *nvidiav1alpha1.GPUClusterConfig) []reconcile.Request { + return mapToSingleton(ctx, gcc) + } + if err := c.Watch(source.Kind( + mgr.GetCache(), + &nvidiav1alpha1.GPUClusterConfig{}, + handler.TypedEnqueueRequestsFromMapFunc(gpuClusterConfigMapFn), + predicate.TypedGenerationChangedPredicate[*nvidiav1alpha1.GPUClusterConfig]{}, + )); err != nil { + return fmt.Errorf("error watching GPUClusterConfig: %w", err) + } + // Watch NVIDIADriver including delete events so owner labels are cleaned up promptly. nvidiaDriverMapFn := func(ctx context.Context, nd *nvidiav1alpha1.NVIDIADriver) []reconcile.Request { return mapToSingleton(ctx, nd) diff --git a/controllers/nodelabeling_controller_test.go b/controllers/nodelabeling_controller_test.go index d2253bfaa..c738c089b 100644 --- a/controllers/nodelabeling_controller_test.go +++ b/controllers/nodelabeling_controller_test.go @@ -31,6 +31,7 @@ import ( "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/reconcile" gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" @@ -581,3 +582,126 @@ func TestLabelNodesWithOrphanedDriverPods(t *testing.T) { }) } } + +const gpuPCILabelKey = "feature.node.kubernetes.io/pci-10de.present" + +func TestUpdateGPUClusterConfigStateLabels(t *testing.T) { + tests := []struct { + name string + initialLabels map[string]string + expectedLabels map[string]string + expectModified bool + }{ + { + name: "GPU node gets the DRA operand deploy labels", + initialLabels: map[string]string{commonGPULabelKey: commonGPULabelValue}, + expectedLabels: map[string]string{ + commonGPULabelKey: commonGPULabelValue, + driverDeployLabelKey: "true", + draDriverDeployLabelKey: "true", + gfdDeployLabelKey: "true", + }, + expectModified: true, + }, + { + name: "node without the common GPU label has nothing to remove", + initialLabels: map[string]string{"kubernetes.io/hostname": "plain"}, + expectedLabels: map[string]string{"kubernetes.io/hostname": "plain"}, + expectModified: false, + }, + { + name: "node that lost its GPUs has the deploy labels removed", + initialLabels: map[string]string{ + commonGPULabelKey: "false", + driverDeployLabelKey: "true", + draDriverDeployLabelKey: "true", + gfdDeployLabelKey: "true", + }, + expectedLabels: map[string]string{commonGPULabelKey: "false"}, + expectModified: true, + }, + { + name: "GPU node missing some deploy labels is converged", + initialLabels: map[string]string{ + commonGPULabelKey: commonGPULabelValue, + driverDeployLabelKey: "true", + }, + expectedLabels: map[string]string{ + commonGPULabelKey: commonGPULabelValue, + driverDeployLabelKey: "true", + draDriverDeployLabelKey: "true", + gfdDeployLabelKey: "true", + }, + expectModified: true, + }, + { + name: "paused deploy labels are honored, not overwritten", + initialLabels: map[string]string{ + commonGPULabelKey: commonGPULabelValue, + driverDeployLabelKey: "false", + draDriverDeployLabelKey: "false", + gfdDeployLabelKey: "false", + }, + expectedLabels: map[string]string{ + commonGPULabelKey: commonGPULabelValue, + driverDeployLabelKey: "false", + draDriverDeployLabelKey: "false", + gfdDeployLabelKey: "false", + }, + expectModified: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + labels := mergeLabels(tc.initialLabels) + modified := updateGPUClusterConfigStateLabels(labels) + assert.Equal(t, tc.expectModified, modified) + assert.Equal(t, tc.expectedLabels, labels) + }) + } +} + +func TestReconcileGPUClusterConfigNodeLabels(t *testing.T) { + newReconciler := func(objs ...client.Object) (*NodeLabelingReconciler, client.Client) { + scheme := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(scheme)) + require.NoError(t, gpuv1.AddToScheme(scheme)) + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build() + return &NodeLabelingReconciler{Client: c, Scheme: scheme, Log: logr.Discard()}, c + } + gpuNode := func() *corev1.Node { + return &corev1.Node{ObjectMeta: metav1.ObjectMeta{ + Name: "gpu-node", + Labels: map[string]string{gpuPCILabelKey: "true"}, + }} + } + + t.Run("GPUClusterConfig present and no ClusterPolicy labels the GPU node", func(t *testing.T) { + gcc := &nvidiav1alpha1.GPUClusterConfig{ObjectMeta: metav1.ObjectMeta{Name: "config"}} + r, c := newReconciler(gcc, gpuNode()) + + _, err := r.Reconcile(context.Background(), reconcile.Request{}) + require.NoError(t, err) + + node := &corev1.Node{} + require.NoError(t, c.Get(context.Background(), types.NamespacedName{Name: "gpu-node"}, node)) + assert.Equal(t, commonGPULabelValue, node.Labels[commonGPULabelKey]) + assert.Equal(t, "true", node.Labels[driverDeployLabelKey]) + assert.Equal(t, "true", node.Labels[draDriverDeployLabelKey]) + assert.Equal(t, "true", node.Labels[gfdDeployLabelKey]) + }) + + t.Run("no ClusterPolicy and no GPUClusterConfig leaves the node untouched", func(t *testing.T) { + r, c := newReconciler(gpuNode()) + + _, err := r.Reconcile(context.Background(), reconcile.Request{}) + require.NoError(t, err) + + node := &corev1.Node{} + require.NoError(t, c.Get(context.Background(), types.NamespacedName{Name: "gpu-node"}, node)) + assert.NotContains(t, node.Labels, commonGPULabelKey) + assert.NotContains(t, node.Labels, draDriverDeployLabelKey) + }) +} diff --git a/controllers/nvidiadriver_controller.go b/controllers/nvidiadriver_controller.go index 5dc142de3..eac2b81ba 100644 --- a/controllers/nvidiadriver_controller.go +++ b/controllers/nvidiadriver_controller.go @@ -64,6 +64,7 @@ type NVIDIADriverReconciler struct { //+kubebuilder:rbac:groups=nvidia.com,resources=nvidiadrivers,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=nvidia.com,resources=nvidiadrivers/status,verbs=get;update;patch //+kubebuilder:rbac:groups=nvidia.com,resources=nvidiadrivers/finalizers,verbs=update +//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusterconfigs,verbs=get;list;watch // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. @@ -98,39 +99,43 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request return reconcile.Result{}, nil } - // Get the singleton NVIDIA ClusterPolicy object in the cluster. - clusterPolicyList := &gpuv1.ClusterPolicyList{} - if err := r.List(ctx, clusterPolicyList); err != nil { - wrappedErr := fmt.Errorf("error getting ClusterPolicy list: %w", err) - logger.Error(err, "error getting ClusterPolicy list") + // Source the cluster-wide host root from the active configuration: a ClusterPolicy takes + // precedence, otherwise the controller runs standalone against the GPUClusterConfig. + clusterPolicy, gpuClusterConfig, err := resolveActiveConfig(ctx, r.Client) + if err != nil { + logger.Error(err, "error resolving active cluster configuration") instance.Status.State = nvidiav1alpha1.NotReady if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil { logger.Error(condErr, "failed to set condition") } - return reconcile.Result{}, wrappedErr + return reconcile.Result{}, err } - if len(clusterPolicyList.Items) == 0 { - err := fmt.Errorf("no ClusterPolicy object found in the cluster") - logger.Error(err, "failed to get ClusterPolicy object") + var hostRoot string + switch { + case clusterPolicy != nil: + // Ensure that ClusterPolicy is configured to use NVIDIADriver CRD + if !clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType() { + msg := "useNvidiaDriverCRD is not enabled in ClusterPolicy" + logger.V(consts.LogLevelWarning).Info("NVIDIADriver reconciliation skipped", "reason", msg) + instance.Status.State = nvidiav1alpha1.Disabled + if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.Reconciled, msg); condErr != nil { + logger.Error(condErr, "failed to set condition") + } + return reconcile.Result{}, nil + } + hostRoot = clusterPolicy.Spec.HostPaths.RootFS + case gpuClusterConfig != nil: + hostRoot = gpuClusterConfig.Spec.HostPaths.RootFS + default: + err := fmt.Errorf("no ClusterPolicy or GPUClusterConfig object found in the cluster") + logger.Error(err, "failed to get a cluster-wide configuration object") instance.Status.State = nvidiav1alpha1.NotReady if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error()); condErr != nil { logger.Error(condErr, "failed to set condition") } return reconcile.Result{}, err } - clusterPolicyInstance := clusterPolicyList.Items[0] - - // Ensure that ClusterPolicy is configured to use NVIDIADriver CRD - if !clusterPolicyInstance.Spec.Driver.UseNvidiaDriverCRDType() { - msg := "useNvidiaDriverCRD is not enabled in ClusterPolicy" - logger.V(consts.LogLevelWarning).Info("NVIDIADriver reconciliation skipped", "reason", msg) - instance.Status.State = nvidiav1alpha1.Disabled - if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.Reconciled, msg); condErr != nil { - logger.Error(condErr, "failed to set condition") - } - return reconcile.Result{}, nil - } // Create a new InfoCatalog which is a generic interface for passing information to state managers infoCatalog := state.NewInfoCatalog() @@ -138,8 +143,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request // Add an entry for ClusterInfo, which was collected before the NVIDIADriver controller was started infoCatalog.Add(state.InfoTypeClusterInfo, r.ClusterInfo) - // Add an entry for Clusterpolicy, which is needed to deploy the driver daemonset - infoCatalog.Add(state.InfoTypeClusterPolicyCR, clusterPolicyInstance) + // Add the host root, which is needed to deploy the driver daemonset + infoCatalog.Add(state.InfoTypeHostRoot, hostRoot) // Verify the nodeSelector configured for this NVIDIADriver instance does // not conflict with any other instances. This ensures only one driver @@ -378,6 +383,24 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl. return err } + // Watch for changes to GPUClusterConfig. Whenever an event is generated for + // GPUClusterConfig, enqueue a reconcile request for all NVIDIADriver instances. + gpuClusterConfigMapFn := func(ctx context.Context, _ *nvidiav1alpha1.GPUClusterConfig) []reconcile.Request { + return r.enqueueAllNVIDIADrivers(ctx) + } + + err = c.Watch( + source.Kind( + mgr.GetCache(), + &nvidiav1alpha1.GPUClusterConfig{}, + handler.TypedEnqueueRequestsFromMapFunc(gpuClusterConfigMapFn), + predicate.TypedGenerationChangedPredicate[*nvidiav1alpha1.GPUClusterConfig]{}, + ), + ) + if err != nil { + return err + } + nodePredicate := predicate.TypedFuncs[*corev1.Node]{ CreateFunc: func(e event.TypedCreateEvent[*corev1.Node]) bool { labels := e.Object.GetLabels() diff --git a/controllers/nvidiadriver_controller_test.go b/controllers/nvidiadriver_controller_test.go index c13f9538c..6ecbba78a 100644 --- a/controllers/nvidiadriver_controller_test.go +++ b/controllers/nvidiadriver_controller_test.go @@ -39,6 +39,7 @@ import ( gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" + "github.com/NVIDIA/gpu-operator/internal/state" "github.com/NVIDIA/gpu-operator/internal/validator" ) @@ -223,6 +224,96 @@ func TestReconcile(t *testing.T) { } } +// TestReconcileStandalone covers the no-ClusterPolicy path: the controller falls back +// to the GPUClusterConfig for the cluster-wide configuration, and fails early when +// neither object exists. +func TestReconcileStandalone(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) + require.NoError(t, gpuv1.AddToScheme(scheme)) + + cp := &gpuv1.ClusterPolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "cluster-policy"}, + Spec: gpuv1.ClusterPolicySpec{ + Driver: gpuv1.DriverSpec{ + UseNvidiaDriverCRD: ptr.To(true), + }, + HostPaths: gpuv1.HostPathsSpec{RootFS: "/cp-root"}, + }, + } + gcc := &nvidiav1alpha1.GPUClusterConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "gpu-cluster-config"}, + Spec: nvidiav1alpha1.GPUClusterConfigSpec{ + HostPaths: gpuv1.HostPathsSpec{RootFS: "/gcc-root"}, + }, + } + + tests := []struct { + name string + objects []client.Object + expectedErr string + expectedHostRoot string + }{ + { + name: "no ClusterPolicy, GPUClusterConfig provides the host root", + objects: []client.Object{gcc}, + expectedHostRoot: "/gcc-root", + }, + { + name: "ClusterPolicy preferred over GPUClusterConfig", + objects: []client.Object{cp, gcc}, + expectedHostRoot: "/cp-root", + }, + { + name: "neither ClusterPolicy nor GPUClusterConfig", + expectedErr: "no ClusterPolicy or GPUClusterConfig object found in the cluster", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + driver := &nvidiav1alpha1.NVIDIADriver{ + ObjectMeta: metav1.ObjectMeta{Name: "test-driver"}, + } + + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(append([]client.Object{driver}, tc.objects...)...). + WithStatusSubresource(&nvidiav1alpha1.NVIDIADriver{}). + Build() + + updater := &FakeConditionUpdater{} + stateManager := &fakeStateManager{results: state.Results{Status: state.SyncStateReady}} + + reconciler := &NVIDIADriverReconciler{ + Client: c, + Scheme: scheme, + conditionUpdater: updater, + nodeSelectorValidator: &FakeNodeSelectorValidator{}, + stateManager: stateManager, + } + + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: driver.Name}} + _, err := reconciler.Reconcile(context.Background(), req) + + if tc.expectedErr != "" { + require.ErrorContains(t, err, tc.expectedErr) + require.Equal(t, nvidiav1alpha1.NotReady, updater.LastErrorState) + return + } + require.NoError(t, err) + + hostRoot, ok := stateManager.lastCatalog.Get(state.InfoTypeHostRoot).(string) + require.True(t, ok, "info catalog must hold a host root string") + require.Equal(t, tc.expectedHostRoot, hostRoot) + + instance := &nvidiav1alpha1.NVIDIADriver{} + require.NoError(t, c.Get(context.Background(), types.NamespacedName{Name: driver.Name}, instance)) + require.Equal(t, nvidiav1alpha1.Ready, instance.Status.State) + }) + } +} + func TestReconcileConflictSetsNotReadyState(t *testing.T) { scheme := runtime.NewScheme() require.NoError(t, nvidiav1alpha1.AddToScheme(scheme)) diff --git a/controllers/state_manager.go b/controllers/state_manager.go index b046875aa..03bdb3647 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -73,11 +73,16 @@ const ( gpuWorkloadConfigVMVgpu = "vm-vgpu" kubevirtDevicePluginDeployLabelKey = "nvidia.com/gpu.deploy.sandbox-device-plugin" kataDevicePluginDeployLabelKey = "nvidia.com/gpu.deploy.kata-sandbox-device-plugin" - podSecurityLabelPrefix = "pod-security.kubernetes.io/" - podSecurityLevelPrivileged = "privileged" - driverAutoUpgradeAnnotationKey = "nvidia.com/gpu-driver-upgrade-enabled" - commonDriverDaemonsetName = "nvidia-driver-daemonset" - commonVGPUManagerDaemonsetName = "nvidia-vgpu-manager-daemonset" + // Deploy labels shared by the ClusterPolicy gpuStateLabels map and the GPUClusterConfig + // (DRA) node-labeling path, so each key string has a single definition. + driverDeployLabelKey = "nvidia.com/gpu.deploy.driver" + draDriverDeployLabelKey = "nvidia.com/gpu.deploy.dra-driver" + gfdDeployLabelKey = "nvidia.com/gpu.deploy.gpu-feature-discovery" + podSecurityLabelPrefix = "pod-security.kubernetes.io/" + podSecurityLevelPrivileged = "privileged" + driverAutoUpgradeAnnotationKey = "nvidia.com/gpu-driver-upgrade-enabled" + commonDriverDaemonsetName = "nvidia-driver-daemonset" + commonVGPUManagerDaemonsetName = "nvidia-vgpu-manager-daemonset" ) var ( @@ -87,14 +92,14 @@ var ( var gpuStateLabels = map[string]map[string]string{ gpuWorkloadConfigContainer: { - "nvidia.com/gpu.deploy.driver": "true", - "nvidia.com/gpu.deploy.gpu-feature-discovery": "true", - "nvidia.com/gpu.deploy.container-toolkit": "true", - "nvidia.com/gpu.deploy.device-plugin": "true", - "nvidia.com/gpu.deploy.dcgm": "true", - "nvidia.com/gpu.deploy.dcgm-exporter": "true", - "nvidia.com/gpu.deploy.node-status-exporter": "true", - "nvidia.com/gpu.deploy.operator-validator": "true", + driverDeployLabelKey: "true", + gfdDeployLabelKey: "true", + "nvidia.com/gpu.deploy.container-toolkit": "true", + "nvidia.com/gpu.deploy.device-plugin": "true", + "nvidia.com/gpu.deploy.dcgm": "true", + "nvidia.com/gpu.deploy.dcgm-exporter": "true", + "nvidia.com/gpu.deploy.node-status-exporter": "true", + "nvidia.com/gpu.deploy.operator-validator": "true", }, gpuWorkloadConfigVMPassthrough: { "nvidia.com/gpu.deploy.sandbox-device-plugin": "true", @@ -112,6 +117,15 @@ var gpuStateLabels = map[string]map[string]string{ }, } +// gpuClusterConfigStateLabels are the nvidia.com/gpu.deploy.* labels the DRA-based +// GPUClusterConfig operands gate their nodeSelectors on, analogous to gpuStateLabels for +// the ClusterPolicy stack. +var gpuClusterConfigStateLabels = map[string]string{ + driverDeployLabelKey: "true", + draDriverDeployLabelKey: "true", + gfdDeployLabelKey: "true", +} + var gpuNodeLabels = map[string]string{ "feature.node.kubernetes.io/pci-10de.present": "true", "feature.node.kubernetes.io/pci-0302_10de.present": "true", diff --git a/internal/state/driver.go b/internal/state/driver.go index 0488b0efb..8c84639a9 100644 --- a/internal/state/driver.go +++ b/internal/state/driver.go @@ -40,7 +40,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/source" - gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" "github.com/NVIDIA/gpu-operator/controllers/clusterinfo" driverconfig "github.com/NVIDIA/gpu-operator/internal/config" @@ -252,11 +251,14 @@ func (s *stateDriver) cleanupStaleDriverDaemonsets(ctx context.Context, cr *nvid func (s *stateDriver) getManifestObjects(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, infoCatalog InfoCatalog) ([]*unstructured.Unstructured, error) { logger := log.FromContext(ctx) - info := infoCatalog.Get(InfoTypeClusterPolicyCR) + info := infoCatalog.Get(InfoTypeHostRoot) if info == nil { - return nil, fmt.Errorf("failed to get ClusterPolicy CR from info catalog") + return nil, fmt.Errorf("failed to get host root from info catalog") + } + hostRoot, ok := info.(string) + if !ok { + return nil, fmt.Errorf("host root in info catalog has unexpected type %T", info) } - clusterPolicy := info.(gpuv1.ClusterPolicy) info = infoCatalog.Get(InfoTypeClusterInfo) if info == nil { @@ -280,7 +282,7 @@ func (s *stateDriver) getManifestObjects(ctx context.Context, cr *nvidiav1alpha1 renderData := &driverRenderData{ GPUDirectRDMA: gpuDirectRDMASpec, Runtime: runtimeSpec, - HostRoot: clusterPolicy.Spec.HostPaths.RootFS, + HostRoot: hostRoot, } if len(nodePools) == 0 { diff --git a/internal/state/driver_test.go b/internal/state/driver_test.go index 2509ad04a..2e57e6e2d 100644 --- a/internal/state/driver_test.go +++ b/internal/state/driver_test.go @@ -149,6 +149,20 @@ func TestDriverRenderMinimal(t *testing.T) { require.Equal(t, string(o), actual) } +func TestDriverRenderMissingHostRoot(t *testing.T) { + state, err := NewStateDriver(nil, "", nil, manifestDir) + require.Nil(t, err) + stateDriver, ok := state.(*stateDriver) + require.True(t, ok) + + catalog := NewInfoCatalog() + catalog.Add(InfoTypeClusterInfo, testClusterInfo{}) + + _, err = stateDriver.getManifestObjects(context.Background(), &nvidiav1alpha1.NVIDIADriver{}, catalog) + require.Error(t, err, "rendering must fail when no host root is in the catalog") + require.Contains(t, err.Error(), "host root") +} + func TestDriverHostNetwork(t *testing.T) { const ( testName = "driver-hostnetwork" diff --git a/internal/state/info_source.go b/internal/state/info_source.go index 4f6409a90..a30b0b6bb 100644 --- a/internal/state/info_source.go +++ b/internal/state/info_source.go @@ -20,7 +20,7 @@ type InfoType uint const ( InfoTypeClusterInfo = iota - InfoTypeClusterPolicyCR + InfoTypeHostRoot ) func NewInfoCatalog() InfoCatalog { diff --git a/manifests/state-gfd/0500_daemonset.yaml b/manifests/state-gfd/0500_daemonset.yaml index a73a142e7..e890405c9 100644 --- a/manifests/state-gfd/0500_daemonset.yaml +++ b/manifests/state-gfd/0500_daemonset.yaml @@ -42,6 +42,10 @@ spec: spec: priorityClassName: system-node-critical serviceAccountName: nvidia-gpu-feature-discovery + # Gate scheduling on the per-component deploy label so the k8s-driver-manager + # can pause it to drain GFD off a node during a driver reload. + nodeSelector: + nvidia.com/gpu.deploy.gpu-feature-discovery: "true" {{- if .GFD.Spec.ImagePullSecrets }} imagePullSecrets: {{- range .GFD.Spec.ImagePullSecrets }}