Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ __configure_fbgemm_gpu_test_rocm () {

# AMD GPUs need to be explicitly made visible to PyTorch for use
# shellcheck disable=SC2155,SC2126
local num_gpus=$(rocm-smi --showproductname | grep GUID | wc -l)
local num_gpus=$(amd-smi list | grep -c "^GPU")
# shellcheck disable=SC2155
local gpu_indices=$(seq 0 $((num_gpus - 1)) | paste -sd, -)
# shellcheck disable=SC2086
Expand Down
7 changes: 3 additions & 4 deletions .github/scripts/utils_rocm.bash
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,9 @@ install_rocm_ubuntu () {
echo "[INSTALL] Cleaning up ..."
print_exec rm -f "${package_name}"

echo "[INFO] Printing ROCM utilities info ..."
# If rocm-smi is installed on a machine without GPUs, this will return error
(print_exec rocminfo) || true
(print_exec rocm-smi) || true
echo "[INFO] Printing AMD-SMI utilities info ..."
# If amd-smi is installed on a machine without GPUs, this will return an error
(print_exec amd-smi) || true
(print_exec hipcc -v) || true

echo "[INSTALL] Successfully installed ROCm ${rocm_version}"
Expand Down
19 changes: 6 additions & 13 deletions .github/scripts/utils_system.bash
Original file line number Diff line number Diff line change
Expand Up @@ -173,26 +173,19 @@ print_gpu_info () {
(lspci -v | grep -e 'Display controller: Advanced') || true

if [[ "${ENFORCE_ROCM_DEVICE}" ]]; then
# Ensure that rocm-smi is available and returns GPU entries
if ! rocm-smi; then
# Ensure that amd-smi is available and returns GPU entries
if ! amd-smi; then
echo "[CHECK] ROCm drivers and ROCm device(s) are required for this workflow, but does not appear to be installed or available!"
return 1
fi

else
if which rocm-smi; then
echo "[CHECK] rocm-smi found; printing info ..."
if which amd-smi; then
echo "[CHECK] amd-smi found; printing info ..."
# If the program is installed on a machine without GPUs, invoking it will return an error
(print_exec rocm-smi --showproductname) || true
(print_exec amd-smi --showproductname) || true
else
echo "[CHECK] rocm-smi not found"
fi

if which rocminfo; then
echo "[CHECK] rocminfo found; printing info ..."
(print_exec rocminfo) || true
else
echo "[CHECK] rocminfo not found"
echo "[CHECK] amd-smi not found"
fi
fi
}
Expand Down
30 changes: 16 additions & 14 deletions ci/utils/gpu_detect.bash
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
detect_gpu_vendor() {
if command -v nvidia-smi &> /dev/null; then
echo "nvidia"
elif command -v rocm-smi &> /dev/null; then
elif command -v amd-smi &> /dev/null; then
echo "amd"
else
echo ""
Expand Down Expand Up @@ -111,28 +111,28 @@ detect_nvidia_gpu_model() {

# Detect the GPU model of the first AMD GPU.
#
# This function queries rocm-smi for the GFX Version and maps it to a known
# This function queries amd-smi for the GFX Version and maps it to a known
# GPU model name using the AMD_GFX_MODEL_MAP associative array.
# The returned model name is always lowercased.
#
# Returns:
# Lowercased GPU model (e.g., "mi300", "mi350", "mi250")
# "" - if rocm-smi is not available or no GPU is detected
# "" - if amd-smi is not available or no GPU is detected
#
# Usage:
# source gpu_detect.bash
# model=$(detect_amd_gpu_model)
# echo "GPU model: $model" # e.g., "mi350"
#
detect_amd_gpu_model() {
# Check if rocm-smi is available
if ! command -v rocm-smi &> /dev/null; then
echo "rocm-smi not found; cannot detect AMD GPU model" >&2
# Check if amd-smi is available
if ! command -v amd-smi &> /dev/null; then
echo "amd-smi not found; cannot detect AMD GPU model" >&2
return 1
fi

# Associative array mapping GFX versions to GPU model names.
# Keys are the GFX versions (from "GFX Version" field in rocm-smi --showproductname).
# Keys are the GFX versions (from "TARGET_GRAPHICS_VERSION" field in amd-smi static --asic).
# Values are the desired lowercased model names.
#
# Target architecture, card model, and ROCm compatibility tables can be found
Expand All @@ -142,7 +142,7 @@ detect_amd_gpu_model() {
# https://www.coelacanth-dream.com/posts/2019/12/30/did-rid-product-matome-p2/
#
# To find the GFX version for a new GPU, run:
# rocm-smi --showproductname | grep "GFX Version"
# amd-smi static --asic | grep "TARGET_GRAPHICS_VERSION"
#
declare -A AMD_GFX_MODEL_MAP=(
# MI350 series (CDNA 4)
Expand All @@ -157,11 +157,13 @@ detect_amd_gpu_model() {
["gfx906"]="mi50"
)

# Get the GFX Version from rocm-smi (first GPU only)
# rocm-smi --showproductname outputs something like:
# GPU[0] : GFX Version: gfx950
# Get the GFX Version from amd-smi (first GPU only)
# amd-smi static --asic outputs something like:
# GPU: 0
# ASIC:
# TARGET_GRAPHICS_VERSION: gfx950
local gfx_version
gfx_version=$(rocm-smi --showproductname 2>/dev/null | grep -m1 "GFX Version:" | sed 's/.*GFX Version:[[:space:]]*//' | xargs)
gfx_version=$(amd-smi static --asic 2>/dev/null | grep -m1 "TARGET_GRAPHICS_VERSION:" | sed 's/.*TARGET_GRAPHICS_VERSION:[[:space:]]*//' | xargs)

if [[ -z "$gfx_version" ]]; then
echo "Could not detect AMD GPU GFX version" >&2
Expand Down Expand Up @@ -269,7 +271,7 @@ detect_gpu_count() {
if [[ "${vendor}" == "nvidia" ]]; then
nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null | wc -l
elif [[ "${vendor}" == "amd" ]]; then
rocm-smi --showid 2>/dev/null | grep -oP "GPU\[\K[0-9]+" | sort -u | wc -l
amd-smi list 2>/dev/null | grep -c "^GPU"
else
echo 1
fi
Expand Down Expand Up @@ -311,7 +313,7 @@ gpu_is_busy() {
fi
elif [[ "${vendor}" == "amd" ]]; then
local util
util=$(rocm-smi -d "${gpu_id}" --showuse 2>/dev/null | grep "GPU use" | awk '{print $NF}' | tr -d '%' || echo "0")
util=$(amd-smi metric -g "${gpu_id}" --usage 2>/dev/null | grep "GFX_ACTIVITY:" | awk '{print $(NF-1)}' || echo "0")
if [[ "${util}" -gt "${util_threshold}" ]]; then
return 0
fi
Expand Down
7 changes: 5 additions & 2 deletions cmake/modules/GpuCppLibrary.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,6 @@ function(gpu_cpp_library)
# Append ROCM includes
target_include_directories(${lib_name} PUBLIC
${FBGEMM_HIP_INCLUDE}
${ROCRAND_INCLUDE}
${ROCM_SMI_INCLUDE}
${args_INCLUDE_DIRS})

else()
Expand Down Expand Up @@ -391,6 +389,11 @@ function(gpu_cpp_library)
list(APPEND library_dependencies ${NVML_LIB_PATH})
endif()

# Add AMD SMI if available (ROCm builds)
if(FBGEMM_AMDSMI_LIB)
list(APPEND library_dependencies ${FBGEMM_AMDSMI_LIB})
endif()

# Link against the external libraries as needed
target_link_libraries(${lib_name} PRIVATE ${library_dependencies})

Expand Down
6 changes: 5 additions & 1 deletion fbgemm_gpu/cmake/Hip.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ if(HIP_FOUND)
# setup, hcc is only used for linking, but it should be used to
# compile the *_hip.cc files as well.
find_library(FBGEMM_HIP_HCC_LIBRARIES ${hip_library_name} HINTS ${ROCM_PATH}/lib)
find_library(FBGEMM_AMDSMI_LIB amd_smi HINTS ${ROCM_PATH}/lib)
if(FBGEMM_AMDSMI_LIB)
message(STATUS "Found AMD SMI library: ${FBGEMM_AMDSMI_LIB}")
endif()

list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_OPERATORS__=1)
# list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1)
Expand Down Expand Up @@ -100,7 +104,7 @@ if(HIP_FOUND)
set(FBGEMM_HIP_INCLUDE ${ROCM_PATH}/include ${FBGEMM_HIP_INCLUDE})
set(FBGEMM_HIP_INCLUDE ${hip_INCLUDE_DIRS} $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}> $<INSTALL_INTERFACE:include> ${FBGEMM_HIP_INCLUDE})

hip_include_directories(${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE})
hip_include_directories(${FBGEMM_HIP_INCLUDE})

list (APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH})
endif()
Original file line number Diff line number Diff line change
Expand Up @@ -121,20 +121,28 @@ The AMDGPU display drivers must be installed on the system prior to all other
environment setup. The steps provided by
`AMD <https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.5/page/How_to_Install_ROCm.html>`__
are the most authoritative instructions for doing this. Driver setup may be
verified with the ``rocm-smi`` command:
verified with the ``amd-smi`` command:

.. code:: sh

rocm-smi

======================= ROCm System Management Interface =======================
================================= Concise Info =================================
GPU Temp (DieEdge) AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
0 33.0c 37.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0%
1 32.0c 39.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0%
2 33.0c 37.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0%
================================================================================
============================= End of ROCm SMI Log ==============================
amd-smi

+------------------------------------------------------------------------------+
| AMD-SMI 26.3.0+615aab95ed |
| amdgpu Version: 6.14.19 |
| ROCm Version: 7.3.0 |
| VBIOS Version: 020.001.000.060.000000 |
| Platform: Linux Baremetal |
|-------------------------------------+----------------------------------------|
| BDF GPU-Name | Mem-Uti Temp UEC Power-Usage |
| GPU HIP-ID OAM-ID Partition-Mode | GFX-Uti Fan Mem-Usage |
|=====================================+========================================|
| 0000:43:00.0 AMD Radeon RX 6800 XT | 0 % 32 °C 0 12/272 W |
| 0 1 N/A N/A | 0 % 0.0 % 16/16368 MB |
|-------------------------------------+----------------------------------------|
| 0000:63:00.0 Radeon RX 7900 XT | 0 % 41 °C 0 50/257 W |
| 1 0 N/A N/A | 4 % 0.0 % 26/20464 MB |
+-------------------------------------+----------------------------------------+

Set Up the ROCm Docker Container and Conda Environment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ For ROCm machines, testing against a ROCm GPU needs to be enabled with
# Specify the specific HIP devices to run the tests on
#
# NOTE: This is necessary if PyTorch is unable to see the devices that
# `rocm-smi --showproductname` can see
# `amd-smi static --asic` can see
export HIP_VISIBLE_DEVICES=0,1,2,3

# Enable for debugging kernel executions
Expand Down
63 changes: 39 additions & 24 deletions fbgemm_gpu/src/topology_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,40 +15,55 @@

#ifdef USE_ROCM
#include <inttypes.h>
#include "amd_smi/amdsmi.h"
#include "hip/hip_runtime.h"
#include "rocm_smi/rocm_smi.h"

#define RSMI_CHECK(fn) \
do { \
rsmi_status_t ret = (fn); \
TORCH_CHECK_EQ((ret), RSMI_STATUS_SUCCESS); \
#define AMDSMI_CHECK(fn) \
do { \
amdsmi_status_t ret = (fn); \
TORCH_CHECK_EQ((ret), AMDSMI_STATUS_SUCCESS); \
} while (0)

#define RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
#define AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16

namespace fbgemm_gpu {
AdjacencyMatrix<Links> get_nvlink_matrix() {
auto world_size = at::cuda::getNumGPUs();
RSMI_CHECK(rsmi_init(0));
AMDSMI_CHECK(amdsmi_init(AMDSMI_INIT_AMD_GPUS));

// Note that ROCm_SMI uses a different numbering method to ROCm runtime,
// Note that AMD SMI uses a different device numbering scheme from the ROCm
// runtime, so we need to learn the mapping by using the bus ID.
uint32_t device_count;
RSMI_CHECK(rsmi_num_monitor_devices(&device_count));

std::unordered_map<Node, uint32_t> rocm_device_to_rsmi_device;
// Get all sockets, then collect all GPU processor handles across sockets.
uint32_t socket_count = 0;
AMDSMI_CHECK(amdsmi_get_socket_handles(&socket_count, nullptr));
std::vector<amdsmi_socket_handle> sockets(socket_count);
AMDSMI_CHECK(amdsmi_get_socket_handles(&socket_count, sockets.data()));

std::vector<amdsmi_processor_handle> processor_handles;
for (uint32_t s = 0; s < socket_count; s++) {
uint32_t device_count = 0;
AMDSMI_CHECK(amdsmi_get_processor_handles(sockets[s], &device_count, nullptr));
std::vector<amdsmi_processor_handle> socket_handles(device_count);
AMDSMI_CHECK(amdsmi_get_processor_handles(
sockets[s], &device_count, socket_handles.data()));
processor_handles.insert(
processor_handles.end(), socket_handles.begin(), socket_handles.end());
}

for (const auto i : c10::irange(device_count)) {
std::unordered_map<Node, amdsmi_processor_handle> hip_device_to_handle;

for (const auto& handle : processor_handles) {
uint64_t pci_info;
RSMI_CHECK(rsmi_dev_pci_id_get(i, &pci_info));
AMDSMI_CHECK(amdsmi_get_gpu_bdf_id(handle, &pci_info));
uint64_t domain, bus, device, function;
domain = (pci_info >> 32) & 0xffffffff;
bus = (pci_info >> 8) & 0xff;
device = (pci_info >> 3) & 0x1f;
function = pci_info & 0x7;
// Unlike CUDA, we do not get the PCI bus ID as a char*, so we need to
// reconstruct it.
char pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
char pci_bus_id_str[AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
sprintf(
pci_bus_id_str,
"%04" PRIu64 ":%02" PRIu64 ":%02" PRIu64 ".%0" PRIu64,
Expand All @@ -57,15 +72,15 @@ AdjacencyMatrix<Links> get_nvlink_matrix() {
device,
function);

std::array<char, RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
std::array<char, AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
std::copy(
&pci_bus_id_str[0],
&pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
&pci_bus_id_str[AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
pci_bus_id.data());
int32_t node = 0;
auto err = hipDeviceGetByPCIBusId(&node, pci_bus_id.data());
if (err == hipSuccess) {
rocm_device_to_rsmi_device.insert({node, i});
hip_device_to_handle.insert({node, handle});
} else {
// flush the last error - this can occur when e.g. we set
// HIP_VISIBLE_DEVICES to a subset of the available GPUs in the system.
Expand All @@ -75,22 +90,22 @@ AdjacencyMatrix<Links> get_nvlink_matrix() {

std::vector<Links> links(world_size * world_size);
for (const auto i : c10::irange(world_size)) {
auto src_rsmi_device = rocm_device_to_rsmi_device.find(i);
if (src_rsmi_device != rocm_device_to_rsmi_device.end()) {
auto src = hip_device_to_handle.find(i);
if (src != hip_device_to_handle.end()) {
for (const auto j : c10::irange(world_size)) {
auto dst_rsmi_device = rocm_device_to_rsmi_device.find(j);
if (dst_rsmi_device != rocm_device_to_rsmi_device.end()) {
auto dst = hip_device_to_handle.find(j);
if (dst != hip_device_to_handle.end()) {
bool is_active;
RSMI_CHECK(rsmi_is_P2P_accessible(
src_rsmi_device->second, dst_rsmi_device->second, &is_active));
AMDSMI_CHECK(
amdsmi_is_P2P_accessible(src->second, dst->second, &is_active));
if (is_active) {
links[i * world_size + j] += 1;
}
}
}
}
}
RSMI_CHECK(rsmi_shut_down());
AMDSMI_CHECK(amdsmi_shut_down());
return [=](Node i, Node j) {
TORCH_CHECK_LT(i, world_size);
TORCH_CHECK_LT(j, world_size);
Expand Down
Loading