|
1 | 1 | { |
2 | 2 | "author_name": "Datadog", |
3 | | - "description": "**DCGM Exporter**\n\nThe DCGM exporter is a tool to connect with Nvidia DCGM and exposes GPU telemetry in containerized environments. This dashboard gives you GPU metrics so you can monitor your GPU's health and performance effectively. \n\n\n**Useful Links**\n\n* [Blog Post](https://www.datadoghq.com/blog/monitor-nvidia-gpus-with-datadog/)\n* [List of GPUs supported](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/getting-started.html)\n", |
| 3 | + "description": "**DCGM Exporter**\n\nThe DCGM exporter is a tool that connects with Nvidia DCGM and exposes GPU telemetry in containerized environments. This dashboard gives you GPU metrics so you can monitor your GPU's health and performance effectively. \n\n\n**Useful Links**\n\n* [Blog Post](https://www.datadoghq.com/blog/monitor-nvidia-gpus-with-datadog/)\n* [List of GPUs supported](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/getting-started.html)", |
4 | 4 | "layout_type": "ordered", |
5 | 5 | "template_variables": [ |
6 | 6 | { |
|
16 | 16 | "prefix": "service" |
17 | 17 | } |
18 | 18 | ], |
19 | | - "title": "DCGM Exporter (Nvidia GPU Monitoring) Overview", |
| 19 | + "title": "LEGACY - DCGM Exporter (Nvidia GPU Monitoring) Overview", |
20 | 20 | "widgets": [ |
| 21 | + { |
| 22 | + "definition": { |
| 23 | + "background_color": "vivid_yellow", |
| 24 | + "layout_type": "ordered", |
| 25 | + "show_title": true, |
| 26 | + "title": "This is a legacy integration - Use Datadog's GPU Monitoring instead", |
| 27 | + "type": "group", |
| 28 | + "widgets": [ |
| 29 | + { |
| 30 | + "definition": { |
| 31 | + "background_color": "white", |
| 32 | + "content": "This integration is no longer fully supported. Instead of enabling this integration, use Datadog's GPU Monitoring (now generally available). \n\nDatadog's GPU Monitoring includes all of this integration's metrics and more. The product also provides proactive detection of device anomalies and guided recommendations for maximizing GPU efficiency and minimizing GPU spend. ", |
| 33 | + "font_size": "14", |
| 34 | + "has_padding": true, |
| 35 | + "show_tick": false, |
| 36 | + "text_align": "left", |
| 37 | + "tick_edge": "left", |
| 38 | + "tick_pos": "50%", |
| 39 | + "type": "note", |
| 40 | + "vertical_align": "top" |
| 41 | + }, |
| 42 | + "id": 3443863408063438, |
| 43 | + "layout": { |
| 44 | + "height": 2, |
| 45 | + "width": 6, |
| 46 | + "x": 0, |
| 47 | + "y": 0 |
| 48 | + } |
| 49 | + }, |
| 50 | + { |
| 51 | + "definition": { |
| 52 | + "background_color": "white", |
| 53 | + "content": "**Useful Links**\n\n* [What is Datadog's GPU Monitoring?](https://docs.datadoghq.com/gpu_monitoring/)\n* [Setup guide for GPU Monitoring](https://docs.datadoghq.com/gpu_monitoring/setup/?tab=datadogoperator)\n", |
| 54 | + "font_size": "16", |
| 55 | + "has_padding": true, |
| 56 | + "show_tick": false, |
| 57 | + "text_align": "left", |
| 58 | + "tick_edge": "left", |
| 59 | + "tick_pos": "50%", |
| 60 | + "type": "note", |
| 61 | + "vertical_align": "top" |
| 62 | + }, |
| 63 | + "id": 5013994176334527, |
| 64 | + "layout": { |
| 65 | + "height": 2, |
| 66 | + "width": 6, |
| 67 | + "x": 6, |
| 68 | + "y": 0 |
| 69 | + } |
| 70 | + } |
| 71 | + ] |
| 72 | + }, |
| 73 | + "id": 3549967635694783, |
| 74 | + "layout": { |
| 75 | + "height": 3, |
| 76 | + "width": 12, |
| 77 | + "x": 0, |
| 78 | + "y": 0 |
| 79 | + } |
| 80 | + }, |
21 | 81 | { |
22 | 82 | "definition": { |
23 | 83 | "banner_img": "/static/images/integration_dashboard/dcgm_hero_1.png", |
|
72 | 132 | }, |
73 | 133 | "id": 6955576996753366, |
74 | 134 | "layout": { |
75 | | - "height": 5, |
| 135 | + "height": 7, |
76 | 136 | "width": 5, |
77 | 137 | "x": 0, |
78 | | - "y": 0 |
| 138 | + "y": 3 |
79 | 139 | } |
80 | 140 | }, |
81 | 141 | { |
|
113 | 173 | "count": 50, |
114 | 174 | "display_format": "countsAndList", |
115 | 175 | "hide_zero_counts": true, |
| 176 | + "last_triggered_format": "relative", |
116 | 177 | "query": "tag:(integration:dcgm)", |
117 | 178 | "show_last_triggered": false, |
118 | 179 | "show_priority": false, |
| 180 | + "show_status": true, |
119 | 181 | "sort": "status,asc", |
120 | 182 | "start": 0, |
121 | 183 | "summary_type": "monitors", |
|
148 | 210 | ], |
149 | 211 | "formulas": [ |
150 | 212 | { |
151 | | - "formula": "query1", |
152 | | - "limit": { |
153 | | - "count": 10, |
154 | | - "order": "desc" |
155 | | - } |
| 213 | + "formula": "query1" |
156 | 214 | } |
157 | 215 | ], |
158 | 216 | "queries": [ |
|
163 | 221 | "query": "sum:dcgm.vgpu_license_status{*} by {dcgm_fi_driver_version}" |
164 | 222 | } |
165 | 223 | ], |
166 | | - "response_format": "scalar" |
| 224 | + "response_format": "scalar", |
| 225 | + "sort": { |
| 226 | + "count": 10, |
| 227 | + "order_by": [ |
| 228 | + { |
| 229 | + "index": 0, |
| 230 | + "order": "desc", |
| 231 | + "type": "formula" |
| 232 | + } |
| 233 | + ] |
| 234 | + } |
167 | 235 | } |
168 | 236 | ], |
169 | 237 | "title": "DCGM Driver Version", |
|
257 | 325 | "height": 5, |
258 | 326 | "width": 7, |
259 | 327 | "x": 5, |
260 | | - "y": 0 |
| 328 | + "y": 3 |
261 | 329 | } |
262 | 330 | }, |
263 | 331 | { |
|
336 | 404 | } |
337 | 405 | ], |
338 | 406 | "show_legend": true, |
339 | | - "time": {}, |
340 | 407 | "title": "GPU Temperature", |
341 | 408 | "title_align": "left", |
342 | 409 | "title_size": "16", |
|
357 | 424 | "height": 7, |
358 | 425 | "width": 12, |
359 | 426 | "x": 0, |
360 | | - "y": 5 |
| 427 | + "y": 10 |
361 | 428 | } |
362 | 429 | }, |
363 | 430 | { |
|
491 | 558 | "height": 5, |
492 | 559 | "width": 12, |
493 | 560 | "x": 0, |
494 | | - "y": 12 |
| 561 | + "y": 17 |
495 | 562 | } |
496 | 563 | }, |
497 | 564 | { |
|
617 | 684 | "is_column_break": true, |
618 | 685 | "width": 12, |
619 | 686 | "x": 0, |
620 | | - "y": 17 |
| 687 | + "y": 44 |
621 | 688 | } |
622 | 689 | }, |
623 | 690 | { |
|
658 | 725 | { |
659 | 726 | "formulas": [ |
660 | 727 | { |
661 | | - "formula": "query1 + query2", |
662 | | - "limit": { |
663 | | - "order": "desc" |
664 | | - } |
| 728 | + "formula": "query1 + query2" |
665 | 729 | } |
666 | 730 | ], |
667 | 731 | "queries": [ |
|
679 | 743 | } |
680 | 744 | ], |
681 | 745 | "response_format": "scalar", |
| 746 | + "sort": { |
| 747 | + "order_by": [ |
| 748 | + { |
| 749 | + "index": 0, |
| 750 | + "order": "desc", |
| 751 | + "type": "formula" |
| 752 | + } |
| 753 | + ] |
| 754 | + }, |
682 | 755 | "style": { |
683 | 756 | "palette": "datadog16" |
684 | 757 | } |
|
771 | 844 | ], |
772 | 845 | "formulas": [ |
773 | 846 | { |
774 | | - "formula": "query1", |
775 | | - "limit": { |
776 | | - "count": 500, |
777 | | - "order": "desc" |
778 | | - } |
| 847 | + "formula": "query1" |
779 | 848 | } |
780 | 849 | ], |
781 | 850 | "queries": [ |
|
786 | 855 | "query": "sum:dcgm.vgpu_license_status{*} by {dcgm_fi_process_name}" |
787 | 856 | } |
788 | 857 | ], |
789 | | - "response_format": "scalar" |
| 858 | + "response_format": "scalar", |
| 859 | + "sort": { |
| 860 | + "count": 500, |
| 861 | + "order_by": [ |
| 862 | + { |
| 863 | + "index": 0, |
| 864 | + "order": "desc", |
| 865 | + "type": "formula" |
| 866 | + } |
| 867 | + ] |
| 868 | + } |
790 | 869 | } |
791 | 870 | ], |
792 | 871 | "style": {}, |
|
865 | 944 | "height": 9, |
866 | 945 | "width": 12, |
867 | 946 | "x": 0, |
868 | | - "y": 23 |
| 947 | + "y": 50 |
869 | 948 | } |
870 | 949 | } |
871 | 950 | ] |
|
0 commit comments