Skip to content

Commit a7670b6

Browse files
authored
[Example] Add a simple NVML example (#1783)
* [Example] Add a simple NVML example * Clean up error handling * Better separation
1 parent 3b3f39b commit a7670b6

File tree

1 file changed

+241
-0
lines changed

1 file changed

+241
-0
lines changed
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
# Copyright 2026 NVIDIA Corporation. All rights reserved.
2+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
3+
4+
5+
# ################################################################################
6+
#
7+
# This example demonstrates the core cuda.bindings.nvml functionality by
8+
# implementing a subset of the NVIDIA System Management Interface (nvidia-smi)
9+
# command line tool in Python.
10+
#
11+
# ################################################################################
12+
13+
14+
import sys
15+
16+
from cuda.bindings import nvml
17+
18+
##################################################################################
19+
# FORMATTING HELPERS
20+
21+
# Utilities to help format the output table. See below for NVML usage.
22+
23+
24+
def format_size(bytes_val: int) -> str:
    """Render a byte count as whole mebibytes, e.g. 1048576 -> "1MiB"."""
    mib = bytes_val / (1024 * 1024)
    return f"{mib:.0f}MiB"
27+
28+
29+
# Column-layout descriptor for the two rows printed per device: one inner
# list per output line, one sub-list per "|"-delimited section, each number
# a column width. Within a section, the last column is right-aligned and the
# rest are left-aligned (see TableFormatter._create_line_format).
LINES = [[[4, 27, 6], [18, 3], [20]], [[4, 6, 13, 13], [22], [9, 10]]]
30+
31+
32+
class TableFormatter:
33+
def __init__(self, lines):
34+
self.formats, self.sizes, self.counts = zip(*[self._create_line_format(line) for line in lines])
35+
36+
def _create_line_format(self, descriptor):
37+
parts = []
38+
sizes = []
39+
for section in descriptor:
40+
parts.append("| ")
41+
sizes.append(1)
42+
for i, align in enumerate(section):
43+
if i == len(section) - 1:
44+
direct = ">"
45+
else:
46+
direct = "<"
47+
parts.append(f"{{:{direct}{align}}} ")
48+
sizes[-1] += align + 1
49+
parts.append("|")
50+
return "".join(parts), sizes, sum(len(x) for x in descriptor)
51+
52+
def print_line(self, char="-"):
53+
parts = ["+"]
54+
for size in self.sizes[0]:
55+
parts.append(char * size)
56+
parts.append("+")
57+
print("".join(parts))
58+
59+
def print_values(self, *args):
60+
for line_format, count in zip(self.formats, self.counts):
61+
print(line_format.format(*args[:count]))
62+
args = args[count:]
63+
64+
65+
def print_table(metadata, devices):
    """Render *metadata* and the per-device dicts as an nvidia-smi style table."""
    formatter = TableFormatter(LINES)

    banner = (
        f"| NVIDIA-MINI-SMI {metadata['driver_version']:<16} Driver Version: "
        f"{metadata['driver_version']:<15} CUDA Version: {metadata['cuda_version']:<9}|"
    )
    print("+-----------------------------------------------------------------------------------------+")
    print(banner)
    formatter.print_line()
    print("| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |")
    print("| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |")
    formatter.print_line("=")

    # Key order must match the column order encoded in LINES.
    field_order = (
        "name",
        "persistence",
        "bus_id",
        "display_active",
        "ecc_mode",
        "fan_speed",
        "temperature",
        "performance_state",
        "power",
        "memory",
        "utilization",
        "compute_mode",
    )
    for device in devices:
        values = [str(device["index"])] + [device[key] for key in field_order]
        formatter.print_values(*values)
        formatter.print_line()
94+
95+
96+
##################################################################################
97+
# NVML USAGE EXAMPLES
98+
99+
100+
def _fetch(getter, *args, formatter=None):
    """Call *getter*(*args*) and return the (optionally formatted) result.

    Returns "N/A" when the query raises nvml.NvmlError, mirroring how
    nvidia-smi reports unsupported or failing queries.
    """
    try:
        value = getter(*args)
    except nvml.NvmlError:
        return "N/A"
    return value if formatter is None else formatter(value)


def _on_off(state):
    """Map an nvml.EnableState to the "On"/"Off" strings nvidia-smi uses."""
    return "On" if state == nvml.EnableState.FEATURE_ENABLED else "Off"


def _compute_mode_str(mode):
    """Human-readable name for an nvml.ComputeMode value."""
    names = {
        nvml.ComputeMode.COMPUTEMODE_DEFAULT: "Default",
        nvml.ComputeMode.COMPUTEMODE_EXCLUSIVE_PROCESS: "E. Process",
        nvml.ComputeMode.COMPUTEMODE_PROHIBITED: "Prohibited",
    }
    return names.get(mode, "Unknown")


def collect_info():
    """Query NVML for driver metadata and per-device statistics.

    Returns:
        A ``(metadata, devices)`` tuple. ``metadata`` maps "driver_version"
        and "cuda_version" to strings; ``devices`` is a list of dicts holding
        the display-ready string fields consumed by print_table(). Any field
        whose NVML query raises nvml.NvmlError is reported as "N/A".
    """
    metadata = {}
    metadata["driver_version"] = nvml.system_get_driver_version()
    # NVML packs the CUDA driver version as major*1000 + minor*10.
    cuda_version_int = nvml.system_get_cuda_driver_version()
    cuda_major = cuda_version_int // 1000
    cuda_minor = (cuda_version_int % 1000) // 10
    metadata["cuda_version"] = f"{cuda_major}.{cuda_minor}"

    devices = []
    for i in range(nvml.device_get_count_v2()):
        handle = nvml.device_get_handle_by_index_v2(i)

        device = {}
        device["index"] = i
        device["name"] = nvml.device_get_name(handle)
        device["persistence"] = _fetch(
            nvml.device_get_persistence_mode, handle, formatter=_on_off
        )
        device["bus_id"] = _fetch(
            nvml.device_get_pci_info_v3, handle, formatter=lambda info: info.bus_id
        )
        device["display_active"] = _fetch(
            nvml.device_get_display_active, handle, formatter=_on_off
        )
        # device_get_ecc_mode returns (current, pending); only current is shown.
        device["ecc_mode"] = _fetch(
            nvml.device_get_ecc_mode, handle, formatter=lambda modes: _on_off(modes[0])
        )
        device["fan_speed"] = _fetch(
            nvml.device_get_fan_speed, handle, formatter=lambda fan: f"{fan: >3}%"
        )
        device["temperature"] = _fetch(
            nvml.device_get_temperature_v,
            handle,
            nvml.TemperatureSensors.TEMPERATURE_GPU,
            formatter=lambda temp: f"{temp}C",
        )
        device["performance_state"] = _fetch(
            nvml.device_get_performance_state, handle, formatter=lambda p: f"P{p}"
        )
        # Power queries return milliwatts; display whole watts. Usage and cap
        # are fetched independently so one failing still shows the other.
        usage_str = _fetch(
            nvml.device_get_power_usage, handle, formatter=lambda mw: f"{mw // 1000}W"
        )
        cap_str = _fetch(
            nvml.device_get_power_management_limit,
            handle,
            formatter=lambda mw: f"{mw // 1000}W",
        )
        device["power"] = f"{usage_str} / {cap_str}"
        device["memory"] = _fetch(
            nvml.device_get_memory_info_v2,
            handle,
            formatter=lambda m: f"{format_size(m.used)} / {format_size(m.total)}",
        )
        device["utilization"] = _fetch(
            nvml.device_get_utilization_rates,
            handle,
            formatter=lambda rates: f"{rates.gpu: >3}%",
        )
        device["compute_mode"] = _fetch(
            nvml.device_get_compute_mode, handle, formatter=_compute_mode_str
        )

        devices.append(device)

    return metadata, devices
224+
225+
226+
def main():
    """Entry point: bring NVML up, render the device table, tear NVML down."""
    try:
        nvml.init_v2()
    except nvml.NvmlError as exc:
        print(f"Failed to initialize NVML: {exc}")
        sys.exit(1)

    try:
        print_table(*collect_info())
    finally:
        # Always release NVML, even if collection or printing fails.
        nvml.shutdown()
238+
239+
240+
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)