From 22c4c4c81e40530229c658cfbb2ee4f939ad3974 Mon Sep 17 00:00:00 2001 From: hanfinetree <109786134+hanfinetree@users.noreply.github.com> Date: Mon, 11 Mar 2024 14:23:31 +0900 Subject: [PATCH] Create 1_overview.json --- Monitoring/Grafana/1_overview.json | 2335 ++++++++++++++++++++++++++++ 1 file changed, 2335 insertions(+) create mode 100644 Monitoring/Grafana/1_overview.json diff --git a/Monitoring/Grafana/1_overview.json b/Monitoring/Grafana/1_overview.json new file mode 100644 index 0000000..a85a062 --- /dev/null +++ b/Monitoring/Grafana/1_overview.json @@ -0,0 +1,2335 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Annotations & Alerts", + "showIn": 0, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 5, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "tags": [], + "targetBlank": true, + "type": "dashboards" + } + ], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "bgColor": "rgb(18, 18, 18)", + "clockType": "12 hour", + "countdownSettings": { + "endCountdownTime": "2021-01-25T19:37:48+09:00", + "endText": "00:00:00" + }, + "countupSettings": { + "beginCountupTime": "2024-03-11T19:55:48+09:00", + "beginText": "00:00:00" + }, + "dateSettings": { + "dateFormat": "YYYY-MM-DD", + "fontSize": "30px", + "fontWeight": "normal", + "locale": "", + "showDate": true + }, + "fontMono": false, + "mode": "time", + "refresh": "sec", + "timeSettings": { + "fontSize": "30px", + "fontWeight": "normal" + }, + "timezone": "Asia/Seoul", + "timezoneSettings": { + "fontSize": "15px", + "fontWeight": "normal", + "showTimezone": true, + "zoneFormat": "nameOffset" + } + }, + "pluginVersion": "2.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "refId": "A" + } + ], + "title": "현재 시간", + "type": "grafana-clock-panel" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-BlYlRd" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 4, + "y": 0 + }, + "id": 17, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "editorMode": "code", + "exemplar": true, + "expr": "ipmi_temperature_celsius{id=\"145\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Peripheral Temp", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "온도 (master server)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 70, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "max": 50, + "min": 10, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "orange", + "value": 40 + } + ] + }, + "unit": "celsius" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Peripheral Temp" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Temp" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 7, + "y": 0 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "editorMode": "code", + "exemplar": true, + "expr": "ipmi_temperature_celsius{id=\"145\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 3, + "legendFormat": "Temp", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "온도 변화 (master server)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 12, + "y": 0 + }, + "id": 7, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": " - test.", + "mode": "markdown" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "refId": "A" + } + ], + "title": "공지사항", + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 17, + "y": 0 + }, + "id": 4, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "##### === 관리자 ===\n\n##### === 유지보수 업체 ===\n(주)다산데이타 \n02-871-9932 \ndasan_as@dasandata.co.kr ", + "mode": "markdown" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "refId": "A" + } + ], + "title": "연락처", + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 21, + "y": 0 + }, + "id": 8, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "[구글](http://google.com)\n\n[User Guide for OpenHPC Cluster](https://github.com/dasandata/Open_HPC/tree/master/Document/User%20Guide)\n\n", + "mode": "markdown" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "refId": "A" + } + ], + "title": "Link", + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "cpu core", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "login1" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "master" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 3, + "x": 0, + "y": 6 + }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "editorMode": "code", + "exemplar": true, + "expr": "node_load1{instance=\"10.1.1.200:9100\"}", + "interval": "", + "legendFormat": "master cpu (96)", + "range": true, + "refId": "A" + } + ], + "title": "master node CPU Load", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "/datasets" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "/home" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 3, + "x": 3, + "y": 6 + }, + "id": 25, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "editorMode": "code", + "expr": "100 - ((node_filesystem_avail_bytes{instance=~\"10.1.1.200:9100\",job=~\"$job\",device!~'rootfs',device !~'tmpfs',device!~'by-uuid',mountpoint!~\"/boot\",mountpoint!~\"/boot/efi\"} * 100) / node_filesystem_size_bytes{instance=~\"10.1.1.200:9100\",job=~\"$job\",device!~'rootfs',device !~'tmpfs',device!~'by-uuid',mountpoint!~\"/boot\",mountpoint!~\"/boot/efi\"})", + "interval": "", + "legendFormat": "{{mountpoint}}", + "range": true, + "refId": "A" + } + ], + "title": "master Disk Space Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 0, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 5, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Allocated" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Allocated (including compl)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Allocated Nodes (including compl)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Down,Drain" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Idle" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Idle Nodes" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Maint, Reserv" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Maint, Resv" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Maint, Resv Nodes" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "semi-dark-purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mainternace, Reserved" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mixed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mixed Nodes" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "rgb(28, 28, 28)", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total Nodes" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "rgb(0, 0, 0)", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Allocated Nodes (including compl)" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "hidden" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 6, + "y": 6 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_nodes_down + slurm_nodes_drain", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Down,Drain", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_nodes_maint + slurm_nodes_resv", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Maint, Reserv", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_nodes_alloc + slurm_nodes_comp", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Allocated", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_nodes_mix", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Mixed", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_nodes_idle", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Idle", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_nodes_alloc + slurm_nodes_down + slurm_nodes_drain + slurm_nodes_idle + slurm_nodes_mix + slurm_nodes_comp + slurm_nodes_maint + slurm_nodes_resv", + "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Total", + "refId": "D" + } + ], + "title": "Nodes Status (Total 1)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + }, + { + "color": "dark-red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 2, + "x": 15, + "y": 6 + }, + "id": 12, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": false, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_nodes_down", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Down", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_nodes_drain", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Drain", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_nodes_err != 0", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Nodes in *error* state", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_nodes_fail != 0", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Nodes in *fail* state", + "refId": "D" + } + ], + "title": "Fail/Down/Drain/Err Nodes", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 0, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 5, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Pending Jobs" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "semi-dark-orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Running Jobs" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 17, + "y": 6 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_queue_completing != 0", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Completing Jobs", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_queue_running", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Running Jobs", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_queue_pending", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pending Jobs", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "slurm_queue_completed != 0", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Completed Jobs", + "refId": "D" + } + ], + "title": "RUNNING/PEND Jobs", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": ".", + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "noValue": ".", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "#d44a3a", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 17, + "x": 0, + "y": 14 + }, + "id": 31, + "maxDataPoints": 100, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": false, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"10.1.1.*:9400\"}", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": ".", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "hide": false, + "refId": "A" + } + ], + "title": "실시간 GPU 사용량 (All nodes)", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "/datasets" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "/home" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "GPU Utilization" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 7, + "x": 17, + "y": 14 + }, + "id": 32, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "avg(DCGM_FI_DEV_GPU_UTIL{instance=~\"10.1.1.*:9400\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "GPU Utilization", + "refId": "A" + } + ], + "title": "gpu 사용률 (All nodes)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": "-", + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "noValue": "-", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-blue", + "value": null + }, + { + "color": "light-blue", + "value": 40 + }, + { + "color": "#EAB839", + "value": 60 + }, + { + "color": "#d44a3a", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 17, + "x": 0, + "y": 20 + }, + "id": 26, + "maxDataPoints": 100, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": false, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "exemplar": true, + "expr": "(1 - avg(rate(node_cpu_seconds_total{job=~\"$job\",mode=\"idle\",note=\"compute-node\"}[1m])) by (instance)) * 100", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "실시간 CPU 사용량 (All nodes)", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "/datasets" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "/home" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "CPU Busy Avg" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "CPU Busy Avg (All nodes)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 7, + "x": 17, + "y": 20 + }, + "id": 28, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "editorMode": "code", + "expr": "100 - (avg(rate(node_cpu_seconds_total{instance=~\"10.1.1.*:9100\",note=\"compute-node\",mode=\"idle\"}[1m])) * 100)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU Busy Avg", + "range": true, + "refId": "A" + } + ], + "title": "CPU 사용률 (All nodes)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": "-", + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "noValue": "-", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-purple", + "value": null + }, + { + "color": "super-light-purple", + "value": 40 + }, + { + "color": "light-orange", + "value": 60 + }, + { + "color": "#d44a3a", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 17, + "x": 0, + "y": 25 + }, + "id": 29, + "maxDataPoints": 100, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": false, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"10.1.1.*:9100\",note=\"compute-node\"} / (node_memory_MemTotal_bytes{instance=~\"10.1.1.*:9100\",note=\"compute-node\"})))* 100", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "실시간 Memory 사용량 (All nodes)", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "/datasets" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "/home" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Used Memory" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Used RAM (All nodes)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "avg(1 - (node_memory_MemAvailable_bytes{instance=~\"10.1.1.*:9100\",note=\"compute-node\"} / (node_memory_MemTotal_bytes{instance=~\"10.1.1.*:9100\",note=\"compute-node\"})))* 100" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 7, + "x": 17, + "y": 25 + }, + "id": 30, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "expr": "avg(1 - (node_memory_MemAvailable_bytes{instance=~\"10.1.1.*:9100\",note=\"compute-node\"} / (node_memory_MemTotal_bytes{instance=~\"10.1.1.*:9100\",note=\"compute-node\"})))* 100", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Used Memory", + "refId": "A" + } + ], + "title": "Memory 사용률 (All nodes)", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "node-exporter", + "value": "node-exporter" + }, + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "definition": "label_values(node_uname_info,job)", + "hide": 2, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": { + "query": "label_values(node_uname_info,job)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "n1", + "value": "n1" + }, + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "definition": "label_values(node_uname_info{job=~\"$job\"}, nodename)", + "hide": 2, + "includeAll": false, + "label": "Host:", + "multi": false, + "name": "name", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=~\"$job\"}, nodename)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "10.1.1.1", + "value": "10.1.1.1" + }, + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "definition": "label_values(node_uname_info{nodename=\"$name\"}, instance)", + "hide": 2, + "includeAll": false, + "label": "Host:", + "multi": false, + "name": "node", + "options": [], + "query": { + "query": "label_values(node_uname_info{nodename=\"$name\"}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/([^:]+):.*/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "9100", + "value": "9100" + }, + "datasource": { + "type": "prometheus", + "uid": "ddfb0sptuu6tce" + }, + "definition": "label_values(node_uname_info{instance=~\"$node:(.*)\"}, instance)", + "hide": 2, + "includeAll": false, + "label": "Port", + "multi": false, + "name": "port", + "options": [], + "query": { + "query": "label_values(node_uname_info{instance=~\"$node:(.*)\"}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/[^:]+:(.*)/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "1) Overview", + "uid": "1KnADkqMk", + "version": 7, + "weekStart": "" +}