Skip to content

Commit

Permalink
changes to allow custom labels for ServiceMonitor
Browse files Browse the repository at this point in the history
Signed-off-by: Saurabh Choudhary <csauoss@gmail.com>
  • Loading branch information
csauoss committed Sep 14, 2024
1 parent 941d442 commit 1a0f052
Show file tree
Hide file tree
Showing 5 changed files with 248 additions and 107 deletions.
81 changes: 43 additions & 38 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,40 @@ func (r Runtime) String() string {
}
}

// ServiceMonitorConfig defines configuration options for the ServiceMonitor
// deployed for NVIDIA GPU Operator resources. The same type backs the
// serviceMonitor field of both OperatorSpec and DCGMExporterSpec.
type ServiceMonitorConfig struct {
// Enabled indicates if ServiceMonitor is deployed. When unset, IsEnabled
// reports false, so the ServiceMonitor is disabled by default.
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable deployment of ServiceMonitor"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
Enabled *bool `json:"enabled,omitempty"`

// Interval at which metrics should be scraped. If not specified, Prometheus’ global scrape interval is used.
// Supported units: y, w, d, h, m, s, ms
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Interval which metrics should be scraped from"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
Interval promv1.Duration `json:"interval,omitempty"`

// HonorLabels chooses the metric’s labels on collisions with target labels.
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Choose the metric's label on collisions with target labels"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
HonorLabels *bool `json:"honorLabels,omitempty"`

// AdditionalLabels are extra labels to add to the ServiceMonitor instance.
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Additional labels to add to ServiceMonitor instance"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
AdditionalLabels map[string]string `json:"additionalLabels,omitempty"`

// Relabelings allows rewriting labels on metric sets.
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Relabelings allows to rewrite labels on metric sets"
Relabelings []*promv1.RelabelConfig `json:"relabelings,omitempty"`
}

// OperatorSpec describes configuration options for the operator
type OperatorSpec struct {
// +kubebuilder:validation:Enum=docker;crio;containerd
Expand All @@ -143,6 +177,11 @@ type OperatorSpec struct {
// queryable and should be preserved when modifying objects.
Annotations map[string]string `json:"annotations,omitempty"`

// Optional: ServiceMonitor configuration for NVIDIA GPU Operator
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceMonitor configuration for NVIDIA GPU Operator"
ServiceMonitor *ServiceMonitorConfig `json:"serviceMonitor,omitempty"`

// UseOpenShiftDriverToolkit indicates if DriverToolkit image should be used on OpenShift to build and install driver modules
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="On OpenShift, enable DriverToolkit image to build and install driver modules"
Expand Down Expand Up @@ -901,7 +940,7 @@ type DCGMExporterSpec struct {
// Optional: ServiceMonitor configuration for NVIDIA DCGM Exporter
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceMonitor configuration for NVIDIA DCGM Exporter"
ServiceMonitor *DCGMExporterServiceMonitorConfig `json:"serviceMonitor,omitempty"`
ServiceMonitor *ServiceMonitorConfig `json:"serviceMonitor,omitempty"`
}

// DCGMExporterMetricsConfig defines metrics to be collected by NVIDIA DCGM Exporter
Expand All @@ -914,40 +953,6 @@ type DCGMExporterMetricsConfig struct {
Name string `json:"name,omitempty"`
}

// DCGMExporterServiceMonitorConfig defines configuration options for the ServiceMonitor
// deployed for NVIDIA DCGM Exporter.
type DCGMExporterServiceMonitorConfig struct {
// Enabled indicates if ServiceMonitor is deployed for NVIDIA DCGM Exporter.
// When unset, IsEnabled reports false, so the ServiceMonitor is disabled by default.
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable deployment of NVIDIA DCGM Exporter ServiceMonitor"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
Enabled *bool `json:"enabled,omitempty"`

// Interval at which metrics should be scraped from NVIDIA DCGM Exporter. If not specified, Prometheus’ global scrape interval is used.
// Supported units: y, w, d, h, m, s, ms
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Interval which metrics should be scraped from NVIDIA DCGM Exporter"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
Interval promv1.Duration `json:"interval,omitempty"`

// HonorLabels chooses the metric’s labels on collisions with target labels.
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Choose the metric's label on collisions with target labels"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
HonorLabels *bool `json:"honorLabels,omitempty"`

// AdditionalLabels are extra labels to add to the ServiceMonitor instance for NVIDIA DCGM Exporter.
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Additional labels to add to ServiceMonitor instance for NVIDIA DCGM Exporter"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
AdditionalLabels map[string]string `json:"additionalLabels,omitempty"`

// Relabelings allows rewriting labels on metric sets for NVIDIA DCGM Exporter.
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Relabelings allows to rewrite labels on metric sets for NVIDIA DCGM Exporter"
Relabelings []*promv1.RelabelConfig `json:"relabelings,omitempty"`
}

// DCGMSpec defines the properties for NVIDIA DCGM deployment
type DCGMSpec struct {
// Enabled indicates if deployment of NVIDIA DCGM Hostengine as a separate pod is enabled.
Expand Down Expand Up @@ -2040,10 +2045,10 @@ func (dcgm *DCGMSpec) IsEnabled() bool {
return *dcgm.Enabled
}

// IsEnabled returns true if ServiceMonitor for DCGM Exporter is enabled through gpu-operator
func (sm *DCGMExporterServiceMonitorConfig) IsEnabled() bool {
// IsEnabled returns true if ServiceMonitor is enabled through gpu-operator
func (sm *ServiceMonitorConfig) IsEnabled() bool {
if sm.Enabled == nil {
// ServiceMonitor for DCGM Exporter is disabled by default
// ServiceMonitor is disabled by default
return false
}
return *sm.Enabled
Expand Down
93 changes: 49 additions & 44 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

124 changes: 120 additions & 4 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -389,25 +389,23 @@ spec:
additionalProperties:
type: string
description: AdditionalLabels to add to ServiceMonitor instance
for NVIDIA DCGM Exporter
type: object
enabled:
description: Enabled indicates if ServiceMonitor is deployed
for NVIDIA DCGM Exporter
type: boolean
honorLabels:
description: HonorLabels chooses the metric’s labels on collisions
with target labels.
type: boolean
interval:
description: |-
Interval which metrics should be scraped from NVIDIA DCGM Exporter. If not specified Prometheus’ global scrape interval is used.
Interval which metrics should be scraped from. If not specified Prometheus’ global scrape interval is used.
Supported units: y, w, d, h, m, s, ms
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
relabelings:
description: Relabelings allows to rewrite labels on metric
sets for NVIDIA DCGM Exporter
sets
items:
description: |-
RelabelConfig allows dynamic rewriting of the label set for targets, alerts,
Expand Down Expand Up @@ -1553,6 +1551,124 @@ spec:
runtimeClass:
default: nvidia
type: string
serviceMonitor:
description: 'Optional: ServiceMonitor configuration for NVIDIA
GPU Operator'
properties:
additionalLabels:
additionalProperties:
type: string
description: AdditionalLabels to add to ServiceMonitor instance
type: object
enabled:
description: Enabled indicates if ServiceMonitor is deployed
type: boolean
honorLabels:
description: HonorLabels chooses the metric’s labels on collisions
with target labels.
type: boolean
interval:
description: |-
Interval which metrics should be scraped from. If not specified Prometheus’ global scrape interval is used.
Supported units: y, w, d, h, m, s, ms
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
relabelings:
description: Relabelings allows to rewrite labels on metric
sets
items:
description: |-
RelabelConfig allows dynamic rewriting of the label set for targets, alerts,
scraped samples and remote write samples.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
properties:
action:
default: replace
description: |-
Action to perform based on the regex matching.
`Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0.
`DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0.
Default: "Replace"
enum:
- replace
- Replace
- keep
- Keep
- drop
- Drop
- hashmod
- HashMod
- labelmap
- LabelMap
- labeldrop
- LabelDrop
- labelkeep
- LabelKeep
- lowercase
- Lowercase
- uppercase
- Uppercase
- keepequal
- KeepEqual
- dropequal
- DropEqual
type: string
modulus:
description: |-
Modulus to take of the hash of the source label values.
Only applicable when the action is `HashMod`.
format: int64
type: integer
regex:
description: Regular expression against which the extracted
value is matched.
type: string
replacement:
description: |-
Replacement value against which a Replace action is performed if the
regular expression matches.
Regex capture groups are available.
type: string
separator:
description: Separator is the string between concatenated
SourceLabels.
type: string
sourceLabels:
description: |-
The source labels select values from existing labels. Their content is
concatenated using the configured Separator and matched against the
configured regular expression.
items:
description: |-
LabelName is a valid Prometheus label name which may only contain ASCII
letters, numbers, as well as underscores.
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
type: string
type: array
targetLabel:
description: |-
Label to which the resulting string is written in a replacement.
It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`,
`KeepEqual` and `DropEqual` actions.
Regex capture groups are available.
type: string
type: object
type: array
type: object
use_ocp_driver_toolkit:
description: UseOpenShiftDriverToolkit indicates if DriverToolkit
image should be used on OpenShift to build and install driver
Expand Down
Loading

0 comments on commit 1a0f052

Please sign in to comment.