diff --git a/dashboards/gpu-assessment-dashboard.json b/dashboards/gpu-assessment-dashboard.json index d24d804..75fa3b2 100644 --- a/dashboards/gpu-assessment-dashboard.json +++ b/dashboards/gpu-assessment-dashboard.json @@ -754,6 +754,220 @@ "title": "Running GPU Pods", "type": "row" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 17, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Temperature (\u00b0C)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": true, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 80 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 2 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "avg(DCGM_FI_DEV_GPU_TEMP{modelName=~\"$gpu_type\"})", + "legendFormat": "{{modelName}}", + "refId": "A" + } + ], + "title": "GPU Temperature (Avg)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Max Temperature (\u00b0C)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": true, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 80 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 2 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "max(DCGM_FI_DEV_GPU_TEMP{modelName=~\"$gpu_type\"})", + "legendFormat": "{{modelName}}", + "refId": "A" + } + ], + "title": "GPU Max Temperature", + "type": "timeseries" + } + ], + "title": "GPU Temperature", + "type": "row" + }, { "datasource": { "type": "prometheus", @@ -1273,10 +1487,8 @@ { "allowCustomValue": false, "current": { - "text": [ - ], - "value": [ - ] + "text": [], + "value": [] }, "definition": "label_values(DCGM_FI_DEV_FB_FREE,modelName)", "label": "GPU Type",