From 1192194a8c886d2d6e59f373428d2bbfc91cf892 Mon Sep 17 00:00:00 2001
From: Mac Chaffee
Date: Fri, 8 Apr 2022 18:08:23 -0400
Subject: [PATCH] Add KubePodEvictionRateHigh alert for elevated eviction rates

Add a `highEvictionRateThreshold` config field (default 0.002 evictions
per second) and a warning-severity alert that fires when
sum(rate(kubelet_evictions[15m])) stays above that threshold for 1m.

Signed-off-by: Mac Chaffee
---
 alerts/resource_alerts.libsonnet | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/alerts/resource_alerts.libsonnet b/alerts/resource_alerts.libsonnet
index 50d82f9cb..84adda9f3 100644
--- a/alerts/resource_alerts.libsonnet
+++ b/alerts/resource_alerts.libsonnet
@@ -15,6 +15,8 @@
     // See https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-can-i-configure-overprovisioning-with-cluster-autoscaler
     // for more details.
     ignoringOverprovisionedWorkloadSelector: '',
+    // Cluster-wide eviction rate (evictions per second, averaged over 15m) above which KubePodEvictionRateHigh fires. The default tolerates a single occasional eviction; anything more frequent triggers the alert.
+    highEvictionRateThreshold: 0.002,
   },
 
   prometheusAlerts+:: {
@@ -156,6 +158,20 @@
               summary: 'Processes experience elevated CPU throttling.',
             },
           },
+          {
+            alert: 'KubePodEvictionRateHigh',
+            expr: |||
+              sum(rate(kubelet_evictions[15m])) > %(highEvictionRateThreshold)s
+            ||| % $._config,
+            labels: {
+              severity: 'warning',
+            },
+            'for': '1m',
+            annotations: {
+              description: 'Pods are being evicted at an unexpectedly high rate of {{ $value }} pods per second. This is typically caused by pods frequently exceeding RAM/ephemeral-storage limits or by nodes being NotReady for extended periods.',
+              summary: 'Cluster is evicting pods at an unexpectedly high rate.',
+            },
+          },
         ],
       },
     ],