diff --git a/DESIGN.md b/DESIGN.md index 563582e0c..9bad26427 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -54,7 +54,7 @@ Jsonnet offers the ability to parameterise configuration, allowing for basic cus alert: "KubePodNotReady", expr: ||| sum by (namespace, pod) ( - kube_pod_status_phase{%(kubeStateMetricsSelector)s, phase!~"Running|Succeeded"} + kube_pod_status_phase{%(kubeStateMetricsSelector)s, phase!~"Running"} ) > $(allowedNotReadyPods)s ||| % $._config, "for": "1h", diff --git a/alerts/apps_alerts.libsonnet b/alerts/apps_alerts.libsonnet index 4359b1a77..8df63da7f 100644 --- a/alerts/apps_alerts.libsonnet +++ b/alerts/apps_alerts.libsonnet @@ -33,7 +33,7 @@ expr: ||| sum by (namespace, pod, %(clusterLabel)s) ( max by(namespace, pod, %(clusterLabel)s) ( - kube_pod_status_phase{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, phase=~"Pending|Unknown|Failed"} + kube_pod_status_phase{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, phase=~"Pending"} ) * on(namespace, pod, %(clusterLabel)s) group_left(owner_kind) topk by(namespace, pod, %(clusterLabel)s) ( 1, max by(namespace, pod, owner_kind, %(clusterLabel)s) (kube_pod_owner{owner_kind!="Job"}) ) diff --git a/rules/apps.libsonnet b/rules/apps.libsonnet index c5a58462f..82bf17463 100644 --- a/rules/apps.libsonnet +++ b/rules/apps.libsonnet @@ -86,10 +86,7 @@ { record: 'cluster:namespace:pod_memory:active:kube_pod_resource_request_or_kube_pod_container_resource_requests', expr: ||| - (kube_pod_resource_request{resource="memory",%(kubeSchedulerSelector)s} or (kube_pod_container_resource_requests{resource="memory",%(kubeStateMetricsSelector)s}) * on (namespace, pod, %(clusterLabel)s) - group_left() max by (namespace, pod, %(clusterLabel)s) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) + (kube_pod_resource_request{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_requests{resource="memory",%(kubeStateMetricsSelector)s}) ||| % $._config, }, { @@ -99,8 +96,6 @@ sum by (namespace, pod, %(clusterLabel)s) ( max by (namespace, pod, container, %(clusterLabel)s) ( kube_pod_resource_request{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_requests{resource="memory",%(kubeStateMetricsSelector)s} - ) * on(namespace, pod, %(clusterLabel)s) group_left() max by (namespace, pod, %(clusterLabel)s) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) @@ -114,10 +109,7 @@ { record: 'cluster:namespace:pod_cpu:active:kube_pod_resource_request_or_kube_pod_container_resource_requests', expr: ||| - (kube_pod_resource_request{resource="memory",%(kubeSchedulerSelector)s} or (kube_pod_container_resource_requests{resource="cpu",%(kubeStateMetricsSelector)s}) * on (namespace, pod, %(clusterLabel)s) - group_left() max by (namespace, pod, %(clusterLabel)s) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) + (kube_pod_resource_request{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_requests{resource="cpu",%(kubeStateMetricsSelector)s}) ||| % $._config, }, { @@ -127,8 +119,6 @@ sum by (namespace, pod, %(clusterLabel)s) ( max by (namespace, pod, container, %(clusterLabel)s) ( kube_pod_resource_request{resource="cpu",%(kubeSchedulerSelector)s} or kube_pod_container_resource_requests{resource="cpu",%(kubeStateMetricsSelector)s} - ) * on(namespace, pod, %(clusterLabel)s) group_left() max by (namespace, pod, %(clusterLabel)s) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) @@ -142,10 +132,7 @@ { record: 'cluster:namespace:pod_memory:active:kube_pod_resource_limit_or_kube_pod_container_resource_limits', expr: ||| - (kube_pod_resource_limit{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_limits{resource="memory",%(kubeStateMetricsSelector)s}) * on (namespace, pod, %(clusterLabel)s) - group_left() max by (namespace, pod, %(clusterLabel)s) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) + (kube_pod_resource_limit{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_limits{resource="memory",%(kubeStateMetricsSelector)s}) ||| % $._config, }, { @@ -155,8 +142,6 @@ sum by (namespace, pod, %(clusterLabel)s) ( max by (namespace, pod, container, %(clusterLabel)s) ( kube_pod_container_resource_limits{resource="memory",%(kubeStateMetricsSelector)s} - ) * on(namespace, pod, %(clusterLabel)s) group_left() max by (namespace, pod, %(clusterLabel)s) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) @@ -170,10 +155,7 @@ { record: 'cluster:namespace:pod_cpu:active:kube_pod_resource_limit_or_kube_pod_container_resource_limits', expr: ||| - (kube_pod_resource_limit{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_limits{resource="cpu",%(kubeStateMetricsSelector)s}) * on (namespace, pod, %(clusterLabel)s) - group_left() max by (namespace, pod, %(clusterLabel)s) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) + (kube_pod_resource_limit{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_limits{resource="cpu",%(kubeStateMetricsSelector)s}) ||| % $._config, }, { @@ -183,8 +165,6 @@ sum by (namespace, pod, %(clusterLabel)s) ( max by (namespace, pod, container, %(clusterLabel)s) ( kube_pod_resource_limit{resource="memory",%(kubeSchedulerSelector)s} or kube_pod_container_resource_limits{resource="cpu",%(kubeStateMetricsSelector)s} - ) * on(namespace, pod, %(clusterLabel)s) group_left() max by (namespace, pod, %(clusterLabel)s) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) diff --git a/tests.yaml b/tests.yaml index 71ab61f9a..b1bda890a 100644 --- a/tests.yaml +++ b/tests.yaml @@ -461,23 +461,13 @@ tests: - eval_time: 0m expr: namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum exp_samples: - - value: 0.15 + - value: 0.3 labels: 'namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}' - - eval_time: 0m - expr: namespace_memory:kube_pod_container_resource_requests:sum - exp_samples: - - value: 1.0e+9 - labels: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}' - eval_time: 1m expr: namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum exp_samples: - - value: 0.15 + - value: 0.3 labels: 'namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}' - - eval_time: 1m - expr: namespace_memory:kube_pod_container_resource_requests:sum - exp_samples: - - value: 1.0e+9 - labels: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}' - interval: 1m input_series: @@ -506,23 +496,13 @@ tests: - eval_time: 0m expr: namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum exp_samples: - - value: 0.15 + - value: 0.3 labels: 'namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}' - - eval_time: 0m - expr: namespace_memory:kube_pod_container_resource_requests:sum - exp_samples: - - value: 1.0e+9 - labels: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}' - eval_time: 1m expr: namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum exp_samples: - - value: 0.15 + - value: 0.3 labels: 'namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}' - - eval_time: 1m - expr: namespace_memory:kube_pod_container_resource_requests:sum - exp_samples: - - value: 1.0e+9 - labels: 'namespace_memory:kube_pod_container_resource_requests:sum{cluster="kubernetes",namespace="kube-apiserver"}' - interval: 1m input_series: