From 5e93ed08a9d84aa1e68e9ccb48905b97e05a428a Mon Sep 17 00:00:00 2001 From: Riccardo Freschi Date: Mon, 25 Sep 2023 10:18:25 +0200 Subject: [PATCH 1/4] added Grafana NVIDIA DCGM Exporter Dashboard --- .../eks/gpu/amg_grafana-dashboards.yaml | 11 +++++++++++ .../eks/gpu/kustomization.yaml | 4 ++++ 2 files changed, 15 insertions(+) create mode 100644 artifacts/grafana-operator-manifests/eks/gpu/amg_grafana-dashboards.yaml create mode 100644 artifacts/grafana-operator-manifests/eks/gpu/kustomization.yaml diff --git a/artifacts/grafana-operator-manifests/eks/gpu/amg_grafana-dashboards.yaml b/artifacts/grafana-operator-manifests/eks/gpu/amg_grafana-dashboards.yaml new file mode 100644 index 0000000..a0d4c43 --- /dev/null +++ b/artifacts/grafana-operator-manifests/eks/gpu/amg_grafana-dashboards.yaml @@ -0,0 +1,11 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: gpu-nvidia-dcgm-exporter-dashboard + namespace: grafana-operator +spec: + folder: "Observability Accelerator Dashboards" + instanceSelector: + matchLabels: + dashboards: "external-grafana" + url: "https://grafana.com/api/dashboards/12239/revisions/2/download" diff --git a/artifacts/grafana-operator-manifests/eks/gpu/kustomization.yaml b/artifacts/grafana-operator-manifests/eks/gpu/kustomization.yaml new file mode 100644 index 0000000..85a7882 --- /dev/null +++ b/artifacts/grafana-operator-manifests/eks/gpu/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - amg_grafana-dashboards.yaml From db4cf32f6329e87d89ae6e9c3ed63c5e62931064 Mon Sep 17 00:00:00 2001 From: Riccardo Freschi Date: Tue, 26 Sep 2023 12:02:44 +0100 Subject: [PATCH 2/4] mapping datasource variable --- .../eks/gpu/amg_grafana-dashboards.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/artifacts/grafana-operator-manifests/eks/gpu/amg_grafana-dashboards.yaml b/artifacts/grafana-operator-manifests/eks/gpu/amg_grafana-dashboards.yaml index a0d4c43..66fe89c 100644 --- a/artifacts/grafana-operator-manifests/eks/gpu/amg_grafana-dashboards.yaml +++ b/artifacts/grafana-operator-manifests/eks/gpu/amg_grafana-dashboards.yaml @@ -4,8 +4,12 @@ metadata: name: gpu-nvidia-dcgm-exporter-dashboard namespace: grafana-operator spec: + datasources: + - inputName: "DS_PROMETHEUS" + datasourceName: "aws-observability-accelerator" folder: "Observability Accelerator Dashboards" instanceSelector: matchLabels: dashboards: "external-grafana" - url: "https://grafana.com/api/dashboards/12239/revisions/2/download" + grafanaCom: + id: 12239 From 7a6d7be104f3b23d800cf5a2eb11640bdf9b274b Mon Sep 17 00:00:00 2001 From: Riccardo Freschi Date: Tue, 10 Oct 2023 11:49:43 +0100 Subject: [PATCH 3/4] first draft --- .../eks/neuron/neuron-monitor.json | 1607 +++++++++++++++++ .../eks/neuron/amg_grafana-dashboards.yaml | 11 + .../eks/neuron/kustomization.yaml | 4 + 3 files changed, 1622 insertions(+) create mode 100644 artifacts/grafana-dashboards/eks/neuron/neuron-monitor.json create mode 100644 artifacts/grafana-operator-manifests/eks/neuron/amg_grafana-dashboards.yaml create mode 100644 artifacts/grafana-operator-manifests/eks/neuron/kustomization.yaml diff --git a/artifacts/grafana-dashboards/eks/neuron/neuron-monitor.json b/artifacts/grafana-dashboards/eks/neuron/neuron-monitor.json new file mode 100644 index 0000000..fc2d915 --- /dev/null +++ b/artifacts/grafana-dashboards/eks/neuron/neuron-monitor.json @@ -0,0 +1,1607 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 8, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.width", + "value": 163 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Field" + }, + "properties": [ + { + "id": "custom.width", + "value": 450 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ami_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 217 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "instance_type" + }, + "properties": [ + { + "id": "custom.width", + "value": 391 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Prometheus instance" + }, + "properties": [ + { + "id": "custom.width", + "value": 641 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 8, + "options": { + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "instance{job=\"kubernetes-service-endpoints\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Instance Info", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "ami_id": false, + "instance": true, + "job": true + }, + "indexByName": { + "Time": 0, + "Value": 7, + "__name__": 1, + "availability_zone": 8, + "instance": 5, + "instance_id": 2, + "instance_name": 3, + "instance_type": 4, + "job": 6, + "region": 9, + "subnet_id": 10 + }, + "renameByName": { + "Value": "", + "availability_zone": "Availability Zone", + "instance": "", + "instance_id": "Instance ID", + "instance_name": "Instance Name", + "instance_type": "Instance Type", + "region": "Region", + "subnet_id": "Subnet" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "super-light-yellow" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 8 + }, + "id": 36, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "count(instance{job=\"kubernetes-service-endpoints\"})\n", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Instance Count", + "type": "stat" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "light-blue" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 8 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum (system_vcpu_count)", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "vCPU Count", + "type": "stat" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "semi-dark-red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 8 + }, + "id": 20, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg(sum by (instance_id) (system_vcpu_usage_ratio))", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "vCPU Utilization", + "type": "gauge" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 8 + }, + "id": 16, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg(system_memory_used_bytes / system_memory_total_bytes)", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Host Memory Usage", + "type": "gauge" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(191, 151, 105)" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 8 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "count(neuroncore_utilization_ratio > 0)", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "NeuronCores in Use", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red" + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 35 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 8 + }, + "id": 4, + "interval": "", + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": true, + "showThresholdMarkers": true + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg(neuroncore_utilization_ratio)", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "NeuronCore Utilization", + "type": "gauge" + }, + { + "datasource": { + "uid": "$datasource" + }, + "description": "", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 18, + "y": 8 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(rate(execution_status_total{status_type=\"completed\"}[1m]))", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Execution Success Rate", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "uid": "$datasource" + }, + "description": "", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 8 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(rate(execution_status_total{status_type!=\"completed\"}[1m]))", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Execution Error Rate", + "type": "stat" + }, + { + "aliasColors": { + "Inf Error Rate": "semi-dark-red", + "Inf Success Rate": "light-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 32, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.4.7", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(rate(execution_status_total{status_type=\"completed\"}[1m]))", + "interval": "", + "legendFormat": "Execution Success Rate", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "sum(rate(execution_status_total{status_type!=\"completed\"}[1m]))", + "interval": "", + "legendFormat": "Execution Error Rate", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Execution Status Rates", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:547", + "format": "short", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:548", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "p0": "dark-green", + "p1": "semi-dark-green", + "p100": "semi-dark-red", + "p25": "light-green", + "p50": "super-light-green", + "p75": "super-light-red", + "p99": "light-red", + "{percentile=\"p0\"}": "dark-green", + "{percentile=\"p1\"}": "semi-dark-green", + "{percentile=\"p100\"}": "dark-red", + "{percentile=\"p25\"}": "light-green", + "{percentile=\"p50\"}": "super-light-green", + "{percentile=\"p75\"}": "light-red", + "{percentile=\"p99\"}": "semi-dark-red" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "description": "", + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 13 + }, + "hiddenSeries": false, + "id": 34, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.4.7", + "pointradius": 1, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg by (percentile) (execution_latency_seconds)", + "interval": "", + "legendFormat": "{{percentile}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Execution Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:61", + "format": "s", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:62", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "unit": "percentunit" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 8, + "x": 0, + "y": 25 + }, + "hiddenSeries": false, + "id": 30, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg by (neuroncore) (neuroncore_utilization_ratio)", + "interval": "", + "legendFormat": "nc{{neuroncore}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "NeuronCore Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:493", + "format": "percentunit", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:494", + "format": "short", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "Runtime system CPU usage ": "light-red", + "Runtime user CPU usage ": "light-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 8, + "x": 8, + "y": 25 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg by (usage_type) (neuron_runtime_vcpu_usage_ratio)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Neuron Runtime {{usage_type}} CPU usage ", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Neuron Runtime vCPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:385", + "format": "percentunit", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:386", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "host": "rgb(0, 217, 255)", + "neuron_device": "super-light-orange" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "unit": "bytes" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 8, + "x": 16, + "y": 25 + }, + "hiddenSeries": false, + "id": 28, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg by (memory_location) (sum by (instance_id, memory_location) (neuron_runtime_memory_used_bytes))", + "interval": "", + "legendFormat": "{{memory_location}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Neuron Runtime Used Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:439", + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:440", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "Memory Usage": "rgb(0, 217, 255)", + "NeuronCore Usage": "light-orange", + "vCPU Usage": "light-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "unit": "percentunit" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 8, + "x": 0, + "y": 37 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg(system_memory_used_bytes / system_memory_total_bytes)", + "instant": false, + "interval": "", + "legendFormat": "Memory Usage", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg(sum by (instance_id) (system_vcpu_usage_ratio))", + "instant": false, + "interval": "", + "legendFormat": "vCPU Usage", + "refId": "B" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg(neuroncore_utilization_ratio)", + "instant": false, + "interval": "", + "legendFormat": "NeuronCore Usage", + "refId": "C" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Host System Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:664", + "format": "percentunit", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:665", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "system": "light-red", + "user": "light-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "unit": "percentunit" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 8, + "x": 8, + "y": 37 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg by (usage_type) (system_vcpu_usage_ratio)", + "interval": "", + "legendFormat": "{{usage_type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Host vCPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:876", + "format": "percentunit", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:877", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "Memory Usage Bytes": "rgb(223, 180, 0)", + "Memory Usage Percent": "rgb(0, 217, 255)" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Memory Usage Percent" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory Usage Bytes" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + } + ] + } + ] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 8, + "x": 16, + "y": 37 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:711" + }, + { + "$$hashKey": "object:931", + "alias": "Memory Usage Bytes", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg(system_memory_used_bytes / system_memory_total_bytes)", + "instant": false, + "interval": "", + "legendFormat": "Memory Usage Percent", + "refId": "A" + }, + { + "datasource": { + "uid": "$datasource" + }, + "expr": "avg(system_memory_used_bytes)", + "instant": false, + "interval": "", + "legendFormat": "Memory Usage Bytes", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Host Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:689", + "format": "percentunit", + "label": "", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:690", + "format": "bytes", + "label": "", + "logBase": 1, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "refresh": "5s", + "revision": 1, + "schemaVersion": 38, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "filters": [], + "hide": 0, + "label": "", + "name": "Filters", + "skipUrlSync": false, + "type": "adhoc" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Neuron / Monitor", + "uid": "EqWNYf5Mz", + "version": 14, + "weekStart": "" + } + \ No newline at end of file diff --git a/artifacts/grafana-operator-manifests/eks/neuron/amg_grafana-dashboards.yaml b/artifacts/grafana-operator-manifests/eks/neuron/amg_grafana-dashboards.yaml new file mode 100644 index 0000000..1a18209 --- /dev/null +++ b/artifacts/grafana-operator-manifests/eks/neuron/amg_grafana-dashboards.yaml @@ -0,0 +1,11 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: neuron-monitor-grafanadashboard + namespace: grafana-operator +spec: + folder: "Observability Accelerator Dashboards" + instanceSelector: + matchLabels: + dashboards: "external-grafana" + url: ${GRAFANA_NEURON_DASH_URL} diff --git a/artifacts/grafana-operator-manifests/eks/neuron/kustomization.yaml b/artifacts/grafana-operator-manifests/eks/neuron/kustomization.yaml new file mode 100644 index 0000000..85a7882 --- /dev/null +++ b/artifacts/grafana-operator-manifests/eks/neuron/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - amg_grafana-dashboards.yaml From cfe1a3b298b27b4360ef7938a226e39f324570e4 Mon Sep 17 00:00:00 2001 From: freschri <117075521+freschri@users.noreply.github.com> Date: Mon, 26 Feb 2024 09:24:03 +0100 Subject: [PATCH 4/4] removed new line at end of neuron-monitor.json --- artifacts/grafana-dashboards/eks/neuron/neuron-monitor.json | 1 - 1 file changed, 1 deletion(-) diff --git a/artifacts/grafana-dashboards/eks/neuron/neuron-monitor.json b/artifacts/grafana-dashboards/eks/neuron/neuron-monitor.json index fc2d915..6e412b5 100644 --- a/artifacts/grafana-dashboards/eks/neuron/neuron-monitor.json +++ b/artifacts/grafana-dashboards/eks/neuron/neuron-monitor.json @@ -1604,4 +1604,3 @@ "version": 14, "weekStart": "" } - \ No newline at end of file