Skip to content

Commit

Permalink
Add selectors to alerts
Browse files: browse the repository at this point in the history
  • Loading branch information
frederiko committed Mar 3, 2023
1 parent 07ee070 commit ed370f9
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 18 deletions.
2 changes: 1 addition & 1 deletion alerts/kubelet.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
{
alert: 'KubeNodeReadinessFlapping',
expr: |||
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
sum(changes(kube_node_status_condition{%(kubeStateMetricsSelector)s,status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
||| % $._config,
'for': '15m',
labels: {
Expand Down
4 changes: 2 additions & 2 deletions alerts/resource_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@
} +
if $._config.showMultiCluster then {
expr: |||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
and
(sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0
(sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
||| % $._config,
annotations+: {
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
Expand Down
5 changes: 3 additions & 2 deletions alerts/system_alerts.libsonnet
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
// Label-matcher strings that are interpolated into PromQL selectors through
// `% $._config` (referenced as %(name)s inside the expression text blocks).
_config+:: {
// Regex matcher that excludes the kube-dns and coredns jobs.
notKubeDnsCoreDnsSelector: 'job!~"kube-dns|coredns"',
// Equality matcher for the kube-apiserver job; applied to the
// rest_client_requests_total selectors in the alert expression below.
kubeApiserverSelector: 'job="kube-apiserver"',
},

prometheusAlerts+:: {
Expand Down Expand Up @@ -28,9 +29,9 @@
// this is normal and an expected error, therefore it should be
// ignored in this alert.
expr: |||
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
(sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s,code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
/
sum(rate(rest_client_requests_total[5m])) by (%(clusterLabel)s, instance, job, namespace))
sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s}[5m])) by (%(clusterLabel)s, instance, job, namespace))
> 0.01
||| % $._config,
'for': '15m',
Expand Down
2 changes: 1 addition & 1 deletion rules/kubelet.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
{
record: 'node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile',
expr: |||
histogram_quantile(%(quantile)s, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (%(clusterLabel)s, instance, le) * on(%(clusterLabel)s, instance) group_left(node) kubelet_node_name{%(kubeletSelector)s})
histogram_quantile(%(quantile)s, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{%(kubeletSelector)s}[5m])) by (%(clusterLabel)s, instance, le) * on(%(clusterLabel)s, instance) group_left(node) kubelet_node_name{%(kubeletSelector)s})
||| % ({ quantile: quantile } + $._config),
labels: {
quantile: quantile,
Expand Down
24 changes: 12 additions & 12 deletions tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -518,29 +518,29 @@ tests:
- interval: 1m
input_series:
# Create a histogram where all of the last 10 samples are in the +Inf (> 10 seconds) bucket.
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.005", instance="10.0.2.15:10250"}'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.005", instance="10.0.2.15:10250"}'
values: '1+0x10'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.01", instance="10.0.2.15:10250"}'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.01", instance="10.0.2.15:10250"}'
values: '1+0x10'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.025", instance="10.0.2.15:10250"}'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.025", instance="10.0.2.15:10250"}'
values: '1+0x10'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.05", instance="10.0.2.15:10250"}'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.05", instance="10.0.2.15:10250"}'
values: '1+0x10'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.1", instance="10.0.2.15:10250"}'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.1", instance="10.0.2.15:10250"}'
values: '1+0x10'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.25", instance="10.0.2.15:10250"}'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.25", instance="10.0.2.15:10250"}'
values: '1+0x10'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="0.5", instance="10.0.2.15:10250"}'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="0.5", instance="10.0.2.15:10250"}'
values: '1+0x10'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="1", instance="10.0.2.15:10250"}'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="1", instance="10.0.2.15:10250"}'
values: '1+0x10'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="2.5", instance="10.0.2.15:10250"}'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="2.5", instance="10.0.2.15:10250"}'
values: '1+0x10'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="5", instance="10.0.2.15:10250"}'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="5", instance="10.0.2.15:10250"}'
values: '1+0x10'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="10", instance="10.0.2.15:10250"}'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="10", instance="10.0.2.15:10250"}'
values: '1+0x10'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kublet", le="+Inf", instance="10.0.2.15:10250"}'
- series: 'kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", le="+Inf", instance="10.0.2.15:10250"}'
values: '30+1x10'
- series: 'kubelet_node_name{endpoint="https-metrics",instance="10.0.2.15:10250",job="kubelet",namespace="kube-system",node="minikube",service="kubelet"}'
values: '1 1 1 1 1 1 1 1 1 1'
Expand Down

0 comments on commit ed370f9

Please sign in to comment.