Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,14 @@ type ServiceMonitorConfig struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Relabelings allows to rewrite labels on metric sets"
Relabelings []*promv1.RelabelConfig `json:"relabelings,omitempty"`

// APIGroup is the API group used for ServiceMonitor CRDs.
// Defaults to monitoring.coreos.com. Set to azmonitoring.coreos.com on AKS
// clusters using Azure Managed Prometheus.
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="API group for ServiceMonitor CRDs"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
APIGroup string `json:"apiGroup,omitempty"`
}

// The Alias for backward compatibility
Expand Down Expand Up @@ -2446,6 +2454,21 @@ func (sm *ServiceMonitorConfig) IsEnabled() bool {
return *sm.Enabled
}

// GetAPIGroup reports which API group ServiceMonitor objects should be
// addressed under. It may be called on a nil receiver: when the receiver is
// nil or no APIGroup has been set, DefaultMonitoringAPIGroup is returned.
func (sm *ServiceMonitorConfig) GetAPIGroup() string {
	if sm == nil || sm.APIGroup == "" {
		return DefaultMonitoringAPIGroup
	}
	return sm.APIGroup
}

// API groups under which the ServiceMonitor CRD may be addressed.
const (
	// DefaultMonitoringAPIGroup is the default Prometheus Operator API group.
	// GetAPIGroup falls back to this value when no API group is configured.
	DefaultMonitoringAPIGroup = "monitoring.coreos.com"
	// AzureMonitoringAPIGroup is used by Azure Managed Prometheus on AKS.
	AzureMonitoringAPIGroup = "azmonitoring.coreos.com"
)

// IsNLSEnabled returns true if NLS should be used for licensing the driver
func (l *DriverLicensingConfigSpec) IsNLSEnabled() bool {
if l.NLSEnabled == nil {
Expand Down
12 changes: 12 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,18 @@ rules:
- patch
- update
- watch
- apiGroups:
- azmonitoring.coreos.com
resources:
- servicemonitors
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- node.k8s.io
resources:
Expand Down
78 changes: 71 additions & 7 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ import (
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
Expand Down Expand Up @@ -5049,6 +5051,59 @@ func applyServiceMonitorCustomEdits(desiredState *gpuv1.ServiceMonitorConfig, cu
}
}

// serviceMonitorCRDName builds the CRD object name ("servicemonitors.<group>")
// for the given API group. An empty apiGroup selects
// gpuv1.DefaultMonitoringAPIGroup.
func serviceMonitorCRDName(apiGroup string) string {
	group := apiGroup
	if group == "" {
		group = gpuv1.DefaultMonitoringAPIGroup
	}
	return "servicemonitors." + group
}

// serviceMonitorGVK returns the v1 ServiceMonitor GroupVersionKind in the
// given API group. An empty apiGroup selects gpuv1.DefaultMonitoringAPIGroup.
func serviceMonitorGVK(apiGroup string) schema.GroupVersionKind {
	gvk := schema.GroupVersionKind{Group: apiGroup, Version: "v1", Kind: "ServiceMonitor"}
	if gvk.Group == "" {
		gvk.Group = gpuv1.DefaultMonitoringAPIGroup
	}
	return gvk
}

// serviceMonitorConfigForState looks up the ServiceMonitor configuration in
// the ClusterPolicy spec that belongs to the named state. It returns nil when
// the state has no associated ServiceMonitor configuration or when that
// configuration is unset.
func serviceMonitorConfigForState(n ClusterPolicyController, state string) *gpuv1.ServiceMonitorConfig {
	// The spec is only dereferenced inside a matching case, so an unrelated
	// state never touches n.singleton. A nil field is returned as-is, which is
	// the same nil pointer the caller would get from the default branch.
	switch state {
	case "state-dcgm-exporter":
		return n.singleton.Spec.DCGMExporter.ServiceMonitor
	case "state-operator-metrics":
		return n.singleton.Spec.Operator.Metrics.ServiceMonitor
	default:
		return nil
	}
}

// serviceMonitorAPIGroupForState resolves the ServiceMonitor API group to use
// for the named state, falling back to gpuv1.DefaultMonitoringAPIGroup when
// the state has no ServiceMonitor configuration.
func serviceMonitorAPIGroupForState(n ClusterPolicyController, state string) string {
	cfg := serviceMonitorConfigForState(n, state)
	if cfg == nil {
		return gpuv1.DefaultMonitoringAPIGroup
	}
	return cfg.GetAPIGroup()
}

// toUnstructuredServiceMonitor converts a typed ServiceMonitor into an
// unstructured object and stamps it with the ServiceMonitor
// GroupVersionKind for apiGroup, so the object can be sent to an API group
// other than the one the typed struct is registered under.
// The conversion error, if any, is returned unchanged.
func toUnstructuredServiceMonitor(sm *promv1.ServiceMonitor, apiGroup string) (*unstructured.Unstructured, error) {
	content, err := runtime.DefaultUnstructuredConverter.ToUnstructured(sm)
	if err != nil {
		return nil, err
	}

	result := &unstructured.Unstructured{Object: content}
	// Overwrite apiVersion/kind so they reflect the requested group.
	gvk := serviceMonitorGVK(apiGroup)
	result.SetGroupVersionKind(gvk)
	return result, nil
}

// deleteServiceMonitor deletes the given ServiceMonitor through c, addressing
// it under apiGroup by first converting it to an unstructured object.
// It returns the conversion error if conversion fails, otherwise the result
// of the client's Delete call.
func deleteServiceMonitor(ctx context.Context, c client.Client, sm *promv1.ServiceMonitor, apiGroup string) error {
	target, convErr := toUnstructuredServiceMonitor(sm, apiGroup)
	if convErr != nil {
		return convErr
	}

	return c.Delete(ctx, target)
}

// ServiceMonitor creates ServiceMonitor object
func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
ctx := n.ctx
Expand All @@ -5058,8 +5113,11 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {

logger := n.logger.WithValues("ServiceMonitor", obj.Name, "Namespace", obj.Namespace)

apiGroup := serviceMonitorAPIGroupForState(n, n.stateNames[state])
crdName := serviceMonitorCRDName(apiGroup)

// Check if ServiceMonitor is a valid kind
serviceMonitorCRDExists, err := crdExists(n, ServiceMonitorCRDName)
serviceMonitorCRDExists, err := crdExists(n, crdName)
if err != nil {
return gpuv1.NotReady, err
}
Expand All @@ -5069,7 +5127,7 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
if !serviceMonitorCRDExists {
return gpuv1.Ready, nil
}
err := n.client.Delete(ctx, obj)
err := deleteServiceMonitor(ctx, n.client, obj, apiGroup)
if err != nil && !apierrors.IsNotFound(err) {
logger.Info("Couldn't delete", "Error", err)
return gpuv1.NotReady, err
Expand All @@ -5084,7 +5142,7 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
if !serviceMonitorCRDExists {
return gpuv1.Ready, nil
}
err := n.client.Delete(ctx, obj)
err := deleteServiceMonitor(ctx, n.client, obj, apiGroup)
if err != nil && !apierrors.IsNotFound(err) {
logger.Info("Couldn't delete", "Error", err)
return gpuv1.NotReady, err
Expand Down Expand Up @@ -5138,11 +5196,17 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
return gpuv1.NotReady, err
}

found := &promv1.ServiceMonitor{}
desired, err := toUnstructuredServiceMonitor(obj, apiGroup)
if err != nil {
return gpuv1.NotReady, err
}

found := &unstructured.Unstructured{}
found.SetGroupVersionKind(serviceMonitorGVK(apiGroup))
err = n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
if err != nil && apierrors.IsNotFound(err) {
logger.Info("Not found, creating...")
err = n.client.Create(ctx, obj)
err = n.client.Create(ctx, desired)
if err != nil {
logger.Info("Couldn't create", "Error", err)
return gpuv1.NotReady, err
Expand All @@ -5153,9 +5217,9 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
}

logger.Info("Found Resource, updating...")
obj.ResourceVersion = found.ResourceVersion
desired.SetResourceVersion(found.GetResourceVersion())

err = n.client.Update(ctx, obj)
err = n.client.Update(ctx, desired)
if err != nil {
logger.Info("Couldn't update", "Error", err)
return gpuv1.NotReady, err
Expand Down
1 change: 1 addition & 0 deletions deployments/gpu-operator/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ rules:
- delete
- apiGroups:
- monitoring.coreos.com
- azmonitoring.coreos.com
resources:
- servicemonitors
- prometheusrules
Expand Down
1 change: 1 addition & 0 deletions deployments/gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ dcgmExporter:
internalTrafficPolicy: Cluster
serviceMonitor:
enabled: true
# apiGroup: azmonitoring.coreos.com # required for Azure Managed Prometheus on AKS
interval: 15s
scrapeTimeout: 10s
honorLabels: false
Expand Down