diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go
index 12f8e9dcde..5fdac040dc 100644
--- a/api/nvidia/v1/clusterpolicy_types.go
+++ b/api/nvidia/v1/clusterpolicy_types.go
@@ -176,6 +176,14 @@ type ServiceMonitorConfig struct {
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Relabelings allows to rewrite labels on metric sets"
 	Relabelings []*promv1.RelabelConfig `json:"relabelings,omitempty"`
+
+	// APIGroup is the API group used for ServiceMonitor CRDs.
+	// Defaults to monitoring.coreos.com. Set to azmonitoring.coreos.com on AKS
+	// clusters using Azure Managed Prometheus.
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="API group for ServiceMonitor CRDs"
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
+	APIGroup string `json:"apiGroup,omitempty"`
 }
 
 // The Alias for backward compatibility
@@ -2446,6 +2454,22 @@ func (sm *ServiceMonitorConfig) IsEnabled() bool {
 	return *sm.Enabled
 }
 
+// GetAPIGroup returns the configured ServiceMonitor API group, or
+// DefaultMonitoringAPIGroup when sm is nil or no group is set.
+func (sm *ServiceMonitorConfig) GetAPIGroup() string {
+	if sm != nil && sm.APIGroup != "" {
+		return sm.APIGroup
+	}
+	return DefaultMonitoringAPIGroup
+}
+
+const (
+	// DefaultMonitoringAPIGroup is the default Prometheus Operator API group.
+	DefaultMonitoringAPIGroup = "monitoring.coreos.com"
+	// AzureMonitoringAPIGroup is used by Azure Managed Prometheus on AKS.
+	AzureMonitoringAPIGroup = "azmonitoring.coreos.com"
+)
+
 // IsNLSEnabled returns true if NLS should be used for licensing the driver
 func (l *DriverLicensingConfigSpec) IsNLSEnabled() bool {
 	if l.NLSEnabled == nil {
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
index 7a631b9e3b..844f9498f6 100644
--- a/config/rbac/role.yaml
+++ b/config/rbac/role.yaml
@@ -117,6 +117,18 @@ rules:
   - patch
   - update
   - watch
+- apiGroups:
+  - azmonitoring.coreos.com
+  resources:
+  - servicemonitors
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
 - apiGroups:
   - node.k8s.io
   resources:
diff --git a/controllers/object_controls.go b/controllers/object_controls.go
index 299da2a45b..cde6db64c9 100644
--- a/controllers/object_controls.go
+++ b/controllers/object_controls.go
@@ -39,6 +39,8 @@ import (
 	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -5049,6 +5051,67 @@ func applyServiceMonitorCustomEdits(desiredState *gpuv1.ServiceMonitorConfig, cu
 	}
 }
 
+// serviceMonitorCRDName returns the ServiceMonitor CRD name for apiGroup, in
+// the form "servicemonitors.<group>". An empty apiGroup resolves to the default
+// group through serviceMonitorGVK.
+func serviceMonitorCRDName(apiGroup string) string {
+	return "servicemonitors." + serviceMonitorGVK(apiGroup).Group
+}
+
+// serviceMonitorGVK returns the v1 ServiceMonitor GroupVersionKind in apiGroup,
+// using gpuv1.DefaultMonitoringAPIGroup when apiGroup is empty.
+func serviceMonitorGVK(apiGroup string) schema.GroupVersionKind {
+	if apiGroup == "" {
+		apiGroup = gpuv1.DefaultMonitoringAPIGroup
+	}
+	return schema.GroupVersionKind{Group: apiGroup, Version: "v1", Kind: "ServiceMonitor"}
+}
+
+// serviceMonitorConfigForState returns the ServiceMonitor config that the
+// ClusterPolicy spec holds for the named state. It returns nil when that config
+// is unset or when state is neither the DCGM exporter nor the operator metrics
+// state.
+func serviceMonitorConfigForState(n ClusterPolicyController, state string) *gpuv1.ServiceMonitorConfig {
+	switch state {
+	case "state-dcgm-exporter":
+		return n.singleton.Spec.DCGMExporter.ServiceMonitor
+	case "state-operator-metrics":
+		return n.singleton.Spec.Operator.Metrics.ServiceMonitor
+	default:
+		return nil
+	}
+}
+
+// serviceMonitorAPIGroupForState returns the ServiceMonitor API group to use
+// for the named state.
+func serviceMonitorAPIGroupForState(n ClusterPolicyController, state string) string {
+	// GetAPIGroup accepts a nil receiver and supplies the default group.
+	return serviceMonitorConfigForState(n, state).GetAPIGroup()
+}
+
+// toUnstructuredServiceMonitor converts sm to an unstructured object and sets
+// its GroupVersionKind to the ServiceMonitor kind in apiGroup.
+func toUnstructuredServiceMonitor(sm *promv1.ServiceMonitor, apiGroup string) (*unstructured.Unstructured, error) {
+	objMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(sm)
+	if err != nil {
+		return nil, fmt.Errorf("converting ServiceMonitor to unstructured for API group %q: %w", apiGroup, err)
+	}
+	u := &unstructured.Unstructured{Object: objMap}
+	u.SetGroupVersionKind(serviceMonitorGVK(apiGroup))
+	return u, nil
+}
+
+// deleteServiceMonitor converts sm to the ServiceMonitor kind in apiGroup and
+// deletes it through c. The error from c.Delete is returned unwrapped so that
+// callers can still test it with apierrors.IsNotFound.
+func deleteServiceMonitor(ctx context.Context, c client.Client, sm *promv1.ServiceMonitor, apiGroup string) error {
+	u, err := toUnstructuredServiceMonitor(sm, apiGroup)
+	if err != nil {
+		return err
+	}
+	return c.Delete(ctx, u)
+}
+
 // ServiceMonitor creates ServiceMonitor object
 func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
 	ctx := n.ctx
@@ -5058,8 +5121,11 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
 
 	logger := n.logger.WithValues("ServiceMonitor", obj.Name, "Namespace", obj.Namespace)
 
+	apiGroup := serviceMonitorAPIGroupForState(n, n.stateNames[state])
+	crdName := serviceMonitorCRDName(apiGroup)
+
 	// Check if ServiceMonitor is a valid kind
-	serviceMonitorCRDExists, err := crdExists(n, ServiceMonitorCRDName)
+	serviceMonitorCRDExists, err := crdExists(n, crdName)
 	if err != nil {
 		return gpuv1.NotReady, err
 	}
@@ -5069,7 +5135,7 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
 			if !serviceMonitorCRDExists {
 				return gpuv1.Ready, nil
 			}
-			err := n.client.Delete(ctx, obj)
+			err := deleteServiceMonitor(ctx, n.client, obj, apiGroup)
 			if err != nil && !apierrors.IsNotFound(err) {
 				logger.Info("Couldn't delete", "Error", err)
 				return gpuv1.NotReady, err
@@ -5084,7 +5150,7 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
 			if !serviceMonitorCRDExists {
 				return gpuv1.Ready, nil
 			}
-			err := n.client.Delete(ctx, obj)
+			err := deleteServiceMonitor(ctx, n.client, obj, apiGroup)
 			if err != nil && !apierrors.IsNotFound(err) {
 				logger.Info("Couldn't delete", "Error", err)
 				return gpuv1.NotReady, err
@@ -5138,11 +5204,17 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
 		return gpuv1.NotReady, err
 	}
 
-	found := &promv1.ServiceMonitor{}
+	desired, err := toUnstructuredServiceMonitor(obj, apiGroup)
+	if err != nil {
+		return gpuv1.NotReady, err
+	}
+
+	found := &unstructured.Unstructured{}
+	found.SetGroupVersionKind(serviceMonitorGVK(apiGroup))
 	err = n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
 	if err != nil && apierrors.IsNotFound(err) {
 		logger.Info("Not found, creating...")
-		err = n.client.Create(ctx, obj)
+		err = n.client.Create(ctx, desired)
 		if err != nil {
 			logger.Info("Couldn't create", "Error", err)
 			return gpuv1.NotReady, err
@@ -5153,9 +5225,9 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
 	}
 
 	logger.Info("Found Resource, updating...")
-	obj.ResourceVersion = found.ResourceVersion
+	desired.SetResourceVersion(found.GetResourceVersion())
 
-	err = n.client.Update(ctx, obj)
+	err = n.client.Update(ctx, desired)
 	if err != nil {
 		logger.Info("Couldn't update", "Error", err)
 		return gpuv1.NotReady, err
diff --git a/deployments/gpu-operator/templates/role.yaml b/deployments/gpu-operator/templates/role.yaml
index dc4674c575..f233bb3e88 100644
--- a/deployments/gpu-operator/templates/role.yaml
+++ b/deployments/gpu-operator/templates/role.yaml
@@ -73,6 +73,7 @@ rules:
   - delete
 - apiGroups:
   - monitoring.coreos.com
+  - azmonitoring.coreos.com
   resources:
   - servicemonitors
   - prometheusrules
diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
index 90efb9fe1a..018cda5cec 100644
--- a/deployments/gpu-operator/values.yaml
+++ b/deployments/gpu-operator/values.yaml
@@ -322,6 +322,7 @@ dcgmExporter:
     internalTrafficPolicy: Cluster
   serviceMonitor:
     enabled: true
+    # apiGroup: azmonitoring.coreos.com # required for Azure Managed Prometheus on AKS
     interval: 15s
     scrapeTimeout: 10s
     honorLabels: false