From df5724febdf9284580bbeff61978dc52ed0c11b8 Mon Sep 17 00:00:00 2001 From: Wen Zhou Date: Sun, 28 Jan 2024 14:02:40 +0100 Subject: [PATCH] fix: adding logic to apply trustyai prometheus (#173) * update(trustyai): adding logic to monitoring Signed-off-by: Wen Zhou * fix(trustyai): prometheus rules for probe Signed-off-by: Wen Zhou * update(trusty): prometheus to use job instead of instance name for record rules Signed-off-by: Wen Zhou --------- Signed-off-by: Wen Zhou --- .../prometheus/apps/prometheus-configs.yaml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/config/monitoring/prometheus/apps/prometheus-configs.yaml b/config/monitoring/prometheus/apps/prometheus-configs.yaml index 0217b6c774a..94c9f921928 100644 --- a/config/monitoring/prometheus/apps/prometheus-configs.yaml +++ b/config/monitoring/prometheus/apps/prometheus-configs.yaml @@ -314,19 +314,22 @@ data: metrics_path: /metrics scheme: http kubernetes_sd_configs: - - role: endpoints + - role: pod namespaces: names: - + selectors: + - role: pod + label: 'app.opendatahub.io/trustyai=true' relabel_configs: - - source_labels: [__meta_kubernetes_service_name] - regex: ^(trustyai-service-operator-controller-manager)$ + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_part_of] + regex: ^(trustyai)$ target_label: kubernetes_name action: keep - source_labels: [__address__] regex: (.*) target_label: __address__ - replacement: ${1}:8080 + replacement: ${1}:8080 - job_name: 'RHOAI Metrics' honor_labels: true @@ -1184,7 +1187,7 @@ data: kserve-alerting.rules: | groups: - - name: SLOs-probe_success + - name: SLOs-probe_success_kserve rules: - alert: Kserve Controller Probe Success Burn Rate annotations: @@ -1464,7 +1467,7 @@ data: record: probe_success:burnrate6h trustyai-alerting.rules: | groups: - - name: SLOs-probe_success + - name: SLOs-probe_success_trustyai rules: - alert: TrustyAI Controller Probe Success Burn Rate annotations: @@ -1478,6 +1481,7 @@ data: for: 2m labels: severity: critical + instance: trustyai-service-operator-controller-manager - alert: TrustyAI Controller Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -1490,6 +1494,7 @@ data: for: 15m labels: severity: critical + instance: trustyai-service-operator-controller-manager - alert: TrustyAI Controller Probe Success Burn Rate annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' @@ -1502,3 +1507,4 @@ data: for: 1h labels: severity: warning + instance: trustyai-service-operator-controller-manager