Skip to content

Commit 527143e

Browse files
mahipdeora25 and claude authored
[dcgm] Update OOTB overview dashboard (DataDog#23413)
* [dcgm] Update OOTB overview dashboard

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Remove "cloned" references from dashboard title and description

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent aa1ff9f commit 527143e

2 files changed

Lines changed: 106 additions & 27 deletions

File tree

dcgm/assets/dashboards/dcgm_overview.json

Lines changed: 105 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"author_name": "Datadog",
3-
"description": "**DCGM Exporter**\n\nThe DCGM exporter is a tool to connect with Nvidia DCGM and exposes GPU telemetry in containerized environments. This dashboard gives you GPU metrics so you can monitor your GPU's health and performance effectively. \n\n\n**Useful Links**\n\n* [Blog Post](https://www.datadoghq.com/blog/monitor-nvidia-gpus-with-datadog/)\n* [List of GPUs supported](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/getting-started.html)\n",
3+
"description": "**DCGM Exporter**\n\nThe DCGM exporter is a tool that connects to Nvidia DCGM and exposes GPU telemetry in containerized environments. This dashboard gives you GPU metrics so you can monitor your GPU's health and performance effectively. \n\n\n**Useful Links**\n\n* [Blog Post](https://www.datadoghq.com/blog/monitor-nvidia-gpus-with-datadog/)\n* [List of GPUs supported](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/getting-started.html)",
44
"layout_type": "ordered",
55
"template_variables": [
66
{
@@ -16,8 +16,68 @@
1616
"prefix": "service"
1717
}
1818
],
19-
"title": "DCGM Exporter (Nvidia GPU Monitoring) Overview",
19+
"title": "LEGACY - DCGM Exporter (Nvidia GPU Monitoring) Overview",
2020
"widgets": [
21+
{
22+
"definition": {
23+
"background_color": "vivid_yellow",
24+
"layout_type": "ordered",
25+
"show_title": true,
26+
"title": "This is a legacy integration - Use Datadog's GPU Monitoring instead",
27+
"type": "group",
28+
"widgets": [
29+
{
30+
"definition": {
31+
"background_color": "white",
32+
"content": "This integration is no longer fully supported. Use Datadog's GPU Monitoring (now generally available) instead of enabling this integration.\n\nDatadog's GPU Monitoring includes all of this integration's metrics and more. It also provides proactive detection of device anomalies and guided recommendations for maximizing GPU efficiency and minimizing GPU spend.",
33+
"font_size": "14",
34+
"has_padding": true,
35+
"show_tick": false,
36+
"text_align": "left",
37+
"tick_edge": "left",
38+
"tick_pos": "50%",
39+
"type": "note",
40+
"vertical_align": "top"
41+
},
42+
"id": 3443863408063438,
43+
"layout": {
44+
"height": 2,
45+
"width": 6,
46+
"x": 0,
47+
"y": 0
48+
}
49+
},
50+
{
51+
"definition": {
52+
"background_color": "white",
53+
"content": "**Useful Links**\n\n* [What is Datadog's GPU Monitoring?](https://docs.datadoghq.com/gpu_monitoring/)\n* [Setup guide for GPU Monitoring](https://docs.datadoghq.com/gpu_monitoring/setup/?tab=datadogoperator)\n",
54+
"font_size": "16",
55+
"has_padding": true,
56+
"show_tick": false,
57+
"text_align": "left",
58+
"tick_edge": "left",
59+
"tick_pos": "50%",
60+
"type": "note",
61+
"vertical_align": "top"
62+
},
63+
"id": 5013994176334527,
64+
"layout": {
65+
"height": 2,
66+
"width": 6,
67+
"x": 6,
68+
"y": 0
69+
}
70+
}
71+
]
72+
},
73+
"id": 3549967635694783,
74+
"layout": {
75+
"height": 3,
76+
"width": 12,
77+
"x": 0,
78+
"y": 0
79+
}
80+
},
2181
{
2282
"definition": {
2383
"banner_img": "/static/images/integration_dashboard/dcgm_hero_1.png",
@@ -72,10 +132,10 @@
72132
},
73133
"id": 6955576996753366,
74134
"layout": {
75-
"height": 5,
135+
"height": 7,
76136
"width": 5,
77137
"x": 0,
78-
"y": 0
138+
"y": 3
79139
}
80140
},
81141
{
@@ -113,9 +173,11 @@
113173
"count": 50,
114174
"display_format": "countsAndList",
115175
"hide_zero_counts": true,
176+
"last_triggered_format": "relative",
116177
"query": "tag:(integration:dcgm)",
117178
"show_last_triggered": false,
118179
"show_priority": false,
180+
"show_status": true,
119181
"sort": "status,asc",
120182
"start": 0,
121183
"summary_type": "monitors",
@@ -148,11 +210,7 @@
148210
],
149211
"formulas": [
150212
{
151-
"formula": "query1",
152-
"limit": {
153-
"count": 10,
154-
"order": "desc"
155-
}
213+
"formula": "query1"
156214
}
157215
],
158216
"queries": [
@@ -163,7 +221,17 @@
163221
"query": "sum:dcgm.vgpu_license_status{*} by {dcgm_fi_driver_version}"
164222
}
165223
],
166-
"response_format": "scalar"
224+
"response_format": "scalar",
225+
"sort": {
226+
"count": 10,
227+
"order_by": [
228+
{
229+
"index": 0,
230+
"order": "desc",
231+
"type": "formula"
232+
}
233+
]
234+
}
167235
}
168236
],
169237
"title": "DCGM Driver Version",
@@ -257,7 +325,7 @@
257325
"height": 5,
258326
"width": 7,
259327
"x": 5,
260-
"y": 0
328+
"y": 3
261329
}
262330
},
263331
{
@@ -336,7 +404,6 @@
336404
}
337405
],
338406
"show_legend": true,
339-
"time": {},
340407
"title": "GPU Temperature",
341408
"title_align": "left",
342409
"title_size": "16",
@@ -357,7 +424,7 @@
357424
"height": 7,
358425
"width": 12,
359426
"x": 0,
360-
"y": 5
427+
"y": 10
361428
}
362429
},
363430
{
@@ -491,7 +558,7 @@
491558
"height": 5,
492559
"width": 12,
493560
"x": 0,
494-
"y": 12
561+
"y": 17
495562
}
496563
},
497564
{
@@ -617,7 +684,7 @@
617684
"is_column_break": true,
618685
"width": 12,
619686
"x": 0,
620-
"y": 17
687+
"y": 44
621688
}
622689
},
623690
{
@@ -658,10 +725,7 @@
658725
{
659726
"formulas": [
660727
{
661-
"formula": "query1 + query2",
662-
"limit": {
663-
"order": "desc"
664-
}
728+
"formula": "query1 + query2"
665729
}
666730
],
667731
"queries": [
@@ -679,6 +743,15 @@
679743
}
680744
],
681745
"response_format": "scalar",
746+
"sort": {
747+
"order_by": [
748+
{
749+
"index": 0,
750+
"order": "desc",
751+
"type": "formula"
752+
}
753+
]
754+
},
682755
"style": {
683756
"palette": "datadog16"
684757
}
@@ -771,11 +844,7 @@
771844
],
772845
"formulas": [
773846
{
774-
"formula": "query1",
775-
"limit": {
776-
"count": 500,
777-
"order": "desc"
778-
}
847+
"formula": "query1"
779848
}
780849
],
781850
"queries": [
@@ -786,7 +855,17 @@
786855
"query": "sum:dcgm.vgpu_license_status{*} by {dcgm_fi_process_name}"
787856
}
788857
],
789-
"response_format": "scalar"
858+
"response_format": "scalar",
859+
"sort": {
860+
"count": 500,
861+
"order_by": [
862+
{
863+
"index": 0,
864+
"order": "desc",
865+
"type": "formula"
866+
}
867+
]
868+
}
790869
}
791870
],
792871
"style": {},
@@ -865,7 +944,7 @@
865944
"height": 9,
866945
"width": 12,
867946
"x": 0,
868-
"y": 23
947+
"y": 50
869948
}
870949
}
871950
]

dcgm/manifest.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,4 @@
5858
"homepage": "https://www.datadoghq.com",
5959
"sales_email": "info@datadoghq.com"
6060
}
61-
}
61+
}

0 commit comments

Comments
 (0)