Skip to content

Commit 012740a

Browse files
committed
Convert rocm-smi to amd-smi
Signed-off-by: Adam360x <Adam.pryor@amd.com>
1 parent 668b9d1 commit 012740a

9 files changed

Lines changed: 86 additions & 71 deletions

File tree

.github/scripts/fbgemm_gpu_test.bash

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ __configure_fbgemm_gpu_test_rocm () {
152152

153153
# AMD GPUs need to be explicitly made visible to PyTorch for use
154154
# shellcheck disable=SC2155,SC2126
155-
local num_gpus=$(rocm-smi --showproductname | grep GUID | wc -l)
155+
local num_gpus=$(amd-smi list | grep -c "^GPU")
156156
# shellcheck disable=SC2155
157157
local gpu_indices=$(seq 0 $((num_gpus - 1)) | paste -sd, -)
158158
# shellcheck disable=SC2086

.github/scripts/utils_rocm.bash

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,10 +100,9 @@ install_rocm_ubuntu () {
100100
echo "[INSTALL] Cleaning up ..."
101101
print_exec rm -f "${package_name}"
102102

103-
echo "[INFO] Printing ROCM utilities info ..."
104-
# If rocm-smi is installed on a machine without GPUs, this will return error
105-
(print_exec rocminfo) || true
106-
(print_exec rocm-smi) || true
103+
echo "[INFO] Printing AMD-SMI utilities info ..."
104+
# If amd-smi is installed on a machine without GPUs, this will return an error
105+
(print_exec amd-smi) || true
107106
(print_exec hipcc -v) || true
108107

109108
echo "[INSTALL] Successfully installed ROCm ${rocm_version}"

.github/scripts/utils_system.bash

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -173,26 +173,19 @@ print_gpu_info () {
173173
(lspci -v | grep -e 'Display controller: Advanced') || true
174174

175175
if [[ "${ENFORCE_ROCM_DEVICE}" ]]; then
176-
# Ensure that rocm-smi is available and returns GPU entries
177-
if ! rocm-smi; then
176+
# Ensure that amd-smi is available and returns GPU entries
177+
if ! amd-smi; then
178178
echo "[CHECK] ROCm drivers and ROCm device(s) are required for this workflow, but does not appear to be installed or available!"
179179
return 1
180180
fi
181181

182182
else
183-
if which rocm-smi; then
184-
echo "[CHECK] rocm-smi found; printing info ..."
183+
if which amd-smi; then
184+
echo "[CHECK] amd-smi found; printing info ..."
185185
# If the program is installed on a machine without GPUs, invoking it will return an error
186-
(print_exec rocm-smi --showproductname) || true
186+
(print_exec amd-smi --showproductname) || true
187187
else
188-
echo "[CHECK] rocm-smi not found"
189-
fi
190-
191-
if which rocminfo; then
192-
echo "[CHECK] rocminfo found; printing info ..."
193-
(print_exec rocminfo) || true
194-
else
195-
echo "[CHECK] rocminfo not found"
188+
echo "[CHECK] amd-smi not found"
196189
fi
197190
fi
198191
}

ci/utils/gpu_detect.bash

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
detect_gpu_vendor() {
3131
if command -v nvidia-smi &> /dev/null; then
3232
echo "nvidia"
33-
elif command -v rocm-smi &> /dev/null; then
33+
elif command -v amd-smi &> /dev/null; then
3434
echo "amd"
3535
else
3636
echo ""
@@ -111,28 +111,28 @@ detect_nvidia_gpu_model() {
111111

112112
# Detect the GPU model of the first AMD GPU.
113113
#
114-
# This function queries rocm-smi for the GFX Version and maps it to a known
114+
# This function queries amd-smi for the GFX Version and maps it to a known
115115
# GPU model name using the AMD_GFX_MODEL_MAP associative array.
116116
# The returned model name is always lowercased.
117117
#
118118
# Returns:
119119
# Lowercased GPU model (e.g., "mi300", "mi350", "mi250")
120-
# "" - if rocm-smi is not available or no GPU is detected
120+
# "" - if amd-smi is not available or no GPU is detected
121121
#
122122
# Usage:
123123
# source gpu_detect.bash
124124
# model=$(detect_amd_gpu_model)
125125
# echo "GPU model: $model" # e.g., "mi350"
126126
#
127127
detect_amd_gpu_model() {
128-
# Check if rocm-smi is available
129-
if ! command -v rocm-smi &> /dev/null; then
130-
echo "rocm-smi not found; cannot detect AMD GPU model" >&2
128+
# Check if amd-smi is available
129+
if ! command -v amd-smi &> /dev/null; then
130+
echo "amd-smi not found; cannot detect AMD GPU model" >&2
131131
return 1
132132
fi
133133

134134
# Associative array mapping GFX versions to GPU model names.
135-
# Keys are the GFX versions (from "GFX Version" field in rocm-smi --showproductname).
135+
# Keys are the GFX versions (from "TARGET_GRAPHICS_VERSION" field in amd-smi static --asic).
136136
# Values are the desired lowercased model names.
137137
#
138138
# Target architecture, card model, and ROCm compatibility tables can be found
@@ -142,7 +142,7 @@ detect_amd_gpu_model() {
142142
# https://www.coelacanth-dream.com/posts/2019/12/30/did-rid-product-matome-p2/
143143
#
144144
# To find the GFX version for a new GPU, run:
145-
# rocm-smi --showproductname | grep "GFX Version"
145+
# amd-smi static --asic | grep "TARGET_GRAPHICS_VERSION"
146146
#
147147
declare -A AMD_GFX_MODEL_MAP=(
148148
# MI350 series (CDNA 4)
@@ -157,11 +157,13 @@ detect_amd_gpu_model() {
157157
["gfx906"]="mi50"
158158
)
159159

160-
# Get the GFX Version from rocm-smi (first GPU only)
161-
# rocm-smi --showproductname outputs something like:
162-
# GPU[0] : GFX Version: gfx950
160+
# Get the GFX Version from amd-smi (first GPU only)
161+
# amd-smi static --asic outputs something like:
162+
# GPU: 0
163+
# ASIC:
164+
# TARGET_GRAPHICS_VERSION: gfx950
163165
local gfx_version
164-
gfx_version=$(rocm-smi --showproductname 2>/dev/null | grep -m1 "GFX Version:" | sed 's/.*GFX Version:[[:space:]]*//' | xargs)
166+
gfx_version=$(amd-smi static --asic 2>/dev/null | grep -m1 "TARGET_GRAPHICS_VERSION:" | sed 's/.*TARGET_GRAPHICS_VERSION:[[:space:]]*//' | xargs)
165167

166168
if [[ -z "$gfx_version" ]]; then
167169
echo "Could not detect AMD GPU GFX version" >&2
@@ -269,7 +271,7 @@ detect_gpu_count() {
269271
if [[ "${vendor}" == "nvidia" ]]; then
270272
nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null | wc -l
271273
elif [[ "${vendor}" == "amd" ]]; then
272-
rocm-smi --showid 2>/dev/null | grep -oP "GPU\[\K[0-9]+" | sort -u | wc -l
274+
amd-smi list 2>/dev/null | grep -c "^GPU"
273275
else
274276
echo 1
275277
fi
@@ -311,7 +313,7 @@ gpu_is_busy() {
311313
fi
312314
elif [[ "${vendor}" == "amd" ]]; then
313315
local util
314-
util=$(rocm-smi -d "${gpu_id}" --showuse 2>/dev/null | grep "GPU use" | awk '{print $NF}' | tr -d '%' || echo "0")
316+
util=$(amd-smi metric -g "${gpu_id}" --usage 2>/dev/null | grep "GFX_ACTIVITY:" | awk '{print $(NF-1)}' || echo "0")
315317
if [[ "${util}" -gt "${util_threshold}" ]]; then
316318
return 0
317319
fi

cmake/modules/GpuCppLibrary.cmake

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -257,8 +257,6 @@ function(gpu_cpp_library)
257257
# Append ROCM includes
258258
target_include_directories(${lib_name} PUBLIC
259259
${FBGEMM_HIP_INCLUDE}
260-
${ROCRAND_INCLUDE}
261-
${ROCM_SMI_INCLUDE}
262260
${args_INCLUDE_DIRS})
263261

264262
else()

fbgemm_gpu/cmake/Hip.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ if(HIP_FOUND)
100100
set(FBGEMM_HIP_INCLUDE ${ROCM_PATH}/include ${FBGEMM_HIP_INCLUDE})
101101
set(FBGEMM_HIP_INCLUDE ${hip_INCLUDE_DIRS} $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}> $<INSTALL_INTERFACE:include> ${FBGEMM_HIP_INCLUDE})
102102

103-
hip_include_directories(${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE})
103+
hip_include_directories(${FBGEMM_HIP_INCLUDE})
104104

105105
list (APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH})
106106
endif()

fbgemm_gpu/docs/src/fbgemm_gpu/development/InstallationInstructions.rst

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -121,20 +121,28 @@ The AMDGPU display drivers must be installed on the system prior to all other
121121
environment setup. The steps provided by
122122
`AMD <https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.5/page/How_to_Install_ROCm.html>`__
123123
are the most authoritative instructions for doing this. Driver setup may be
124-
verified with the ``rocm-smi`` command:
124+
verified with the ``amd-smi`` command:
125125

126126
.. code:: sh
127127
128-
rocm-smi
129-
130-
======================= ROCm System Management Interface =======================
131-
================================= Concise Info =================================
132-
GPU Temp (DieEdge) AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
133-
0 33.0c 37.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0%
134-
1 32.0c 39.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0%
135-
2 33.0c 37.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0%
136-
================================================================================
137-
============================= End of ROCm SMI Log ==============================
128+
amd-smi
129+
130+
+------------------------------------------------------------------------------+
131+
| AMD-SMI 26.3.0+615aab95ed |
132+
| amdgpu Version: 6.14.19 |
133+
| ROCm Version: 7.3.0 |
134+
| VBIOS Version: 020.001.000.060.000000 |
135+
| Platform: Linux Baremetal |
136+
|-------------------------------------+----------------------------------------|
137+
| BDF GPU-Name | Mem-Uti Temp UEC Power-Usage |
138+
| GPU HIP-ID OAM-ID Partition-Mode | GFX-Uti Fan Mem-Usage |
139+
|=====================================+========================================|
140+
| 0000:43:00.0 AMD Radeon RX 6800 XT | 0 % 32 °C 0 12/272 W |
141+
| 0 1 N/A N/A | 0 % 0.0 % 16/16368 MB |
142+
|-------------------------------------+----------------------------------------|
143+
| 0000:63:00.0 Radeon RX 7900 XT | 0 % 41 °C 0 50/257 W |
144+
| 1 0 N/A N/A | 4 % 0.0 % 26/20464 MB |
145+
+-------------------------------------+----------------------------------------+
138146

139147
Set Up the ROCm Docker Container and Conda Environment
140148
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

fbgemm_gpu/docs/src/fbgemm_gpu/development/TestInstructions.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ For ROCm machines, testing against a ROCm GPU needs to be enabled with
7878
# Specify the specific HIP devices to run the tests on
7979
#
8080
# NOTE: This is necessary if PyTorch is unable to see the devices that
81-
# `rocm-smi --showproductname` can see
81+
# `amd-smi static --asic` can see
8282
export HIP_VISIBLE_DEVICES=0,1,2,3
8383
8484
# Enable for debugging kernel executions

fbgemm_gpu/src/topology_utils.cpp

Lines changed: 39 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -15,40 +15,55 @@
1515

1616
#ifdef USE_ROCM
1717
#include <inttypes.h>
18+
#include "amd_smi/amdsmi.h"
1819
#include "hip/hip_runtime.h"
19-
#include "rocm_smi/rocm_smi.h"
2020

21-
#define RSMI_CHECK(fn) \
22-
do { \
23-
rsmi_status_t ret = (fn); \
24-
TORCH_CHECK_EQ((ret), RSMI_STATUS_SUCCESS); \
21+
#define AMDSMI_CHECK(fn) \
22+
do { \
23+
amdsmi_status_t ret = (fn); \
24+
TORCH_CHECK_EQ((ret), AMDSMI_STATUS_SUCCESS); \
2525
} while (0)
2626

27-
#define RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
27+
#define AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
2828

2929
namespace fbgemm_gpu {
3030
AdjacencyMatrix<Links> get_nvlink_matrix() {
3131
auto world_size = at::cuda::getNumGPUs();
32-
RSMI_CHECK(rsmi_init(0));
32+
AMDSMI_CHECK(amdsmi_init(AMDSMI_INIT_AMD_GPUS));
3333

34-
// Note that ROCm_SMI uses a different numbering method to ROCm runtime,
34+
// Note that AMD SMI uses a different numbering scheme than the ROCm runtime,
3535
// so we need to learn the mapping by using the bus ID.
36-
uint32_t device_count;
37-
RSMI_CHECK(rsmi_num_monitor_devices(&device_count));
3836

39-
std::unordered_map<Node, uint32_t> rocm_device_to_rsmi_device;
37+
// Get all sockets, then collect all GPU processor handles across sockets.
38+
uint32_t socket_count = 0;
39+
AMDSMI_CHECK(amdsmi_get_socket_handles(&socket_count, nullptr));
40+
std::vector<amdsmi_socket_handle> sockets(socket_count);
41+
AMDSMI_CHECK(amdsmi_get_socket_handles(&socket_count, sockets.data()));
42+
43+
std::vector<amdsmi_processor_handle> processor_handles;
44+
for (uint32_t s = 0; s < socket_count; s++) {
45+
uint32_t device_count = 0;
46+
AMDSMI_CHECK(amdsmi_get_processor_handles(sockets[s], &device_count, nullptr));
47+
std::vector<amdsmi_processor_handle> socket_handles(device_count);
48+
AMDSMI_CHECK(amdsmi_get_processor_handles(
49+
sockets[s], &device_count, socket_handles.data()));
50+
processor_handles.insert(
51+
processor_handles.end(), socket_handles.begin(), socket_handles.end());
52+
}
4053

41-
for (const auto i : c10::irange(device_count)) {
54+
std::unordered_map<Node, amdsmi_processor_handle> hip_device_to_handle;
55+
56+
for (const auto& handle : processor_handles) {
4257
uint64_t pci_info;
43-
RSMI_CHECK(rsmi_dev_pci_id_get(i, &pci_info));
58+
AMDSMI_CHECK(amdsmi_get_gpu_bdf_id(handle, &pci_info));
4459
uint64_t domain, bus, device, function;
4560
domain = (pci_info >> 32) & 0xffffffff;
4661
bus = (pci_info >> 8) & 0xff;
4762
device = (pci_info >> 3) & 0x1f;
4863
function = pci_info & 0x7;
4964
// Different from CUDA, we do not get the PCI BUS ID as a char* and we need
5065
// to reconstruct it.
51-
char pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
66+
char pci_bus_id_str[AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
5267
sprintf(
5368
pci_bus_id_str,
5469
"%04" PRIu64 ":%02" PRIu64 ":%02" PRIu64 ".%0" PRIu64,
@@ -57,15 +72,15 @@ AdjacencyMatrix<Links> get_nvlink_matrix() {
5772
device,
5873
function);
5974

60-
std::array<char, RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
75+
std::array<char, AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
6176
std::copy(
6277
&pci_bus_id_str[0],
63-
&pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
78+
&pci_bus_id_str[AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
6479
pci_bus_id.data());
6580
int32_t node = 0;
6681
auto err = hipDeviceGetByPCIBusId(&node, pci_bus_id.data());
6782
if (err == hipSuccess) {
68-
rocm_device_to_rsmi_device.insert({node, i});
83+
hip_device_to_handle.insert({node, handle});
6984
} else {
7085
// flush the last error - this can occur when e.g. we set
7186
// HIP_VISIBLE_DEVICES to a subset of the available GPUs in the system.
@@ -75,22 +90,22 @@ AdjacencyMatrix<Links> get_nvlink_matrix() {
7590

7691
std::vector<Links> links(world_size * world_size);
7792
for (const auto i : c10::irange(world_size)) {
78-
auto src_rsmi_device = rocm_device_to_rsmi_device.find(i);
79-
if (src_rsmi_device != rocm_device_to_rsmi_device.end()) {
93+
auto src = hip_device_to_handle.find(i);
94+
if (src != hip_device_to_handle.end()) {
8095
for (const auto j : c10::irange(world_size)) {
81-
auto dst_rsmi_device = rocm_device_to_rsmi_device.find(j);
82-
if (dst_rsmi_device != rocm_device_to_rsmi_device.end()) {
96+
auto dst = hip_device_to_handle.find(j);
97+
if (dst != hip_device_to_handle.end()) {
8398
bool is_active;
84-
RSMI_CHECK(rsmi_is_P2P_accessible(
85-
src_rsmi_device->second, dst_rsmi_device->second, &is_active));
99+
AMDSMI_CHECK(
100+
amdsmi_is_P2P_accessible(src->second, dst->second, &is_active));
86101
if (is_active) {
87102
links[i * world_size + j] += 1;
88103
}
89104
}
90105
}
91106
}
92107
}
93-
RSMI_CHECK(rsmi_shut_down());
108+
AMDSMI_CHECK(amdsmi_shut_down());
94109
return [=](Node i, Node j) {
95110
TORCH_CHECK_LT(i, world_size);
96111
TORCH_CHECK_LT(j, world_size);

0 commit comments

Comments
 (0)