diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 984e364ca5..d10ed0a96f 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -152,7 +152,7 @@ __configure_fbgemm_gpu_test_rocm () { # AMD GPUs need to be explicitly made visible to PyTorch for use # shellcheck disable=SC2155,SC2126 - local num_gpus=$(rocm-smi --showproductname | grep GUID | wc -l) + local num_gpus=$(amd-smi list | grep -c "^GPU") # shellcheck disable=SC2155 local gpu_indices=$(seq 0 $((num_gpus - 1)) | paste -sd, -) # shellcheck disable=SC2086 diff --git a/.github/scripts/utils_rocm.bash b/.github/scripts/utils_rocm.bash index 9836b60770..8e0d4d2e15 100644 --- a/.github/scripts/utils_rocm.bash +++ b/.github/scripts/utils_rocm.bash @@ -100,10 +100,9 @@ install_rocm_ubuntu () { echo "[INSTALL] Cleaning up ..." print_exec rm -f "${package_name}" - echo "[INFO] Printing ROCM utilities info ..." - # If rocm-smi is installed on a machine without GPUs, this will return error - (print_exec rocminfo) || true - (print_exec rocm-smi) || true + echo "[INFO] Printing AMD-SMI utilities info ..." + # If amd-smi is installed on a machine without GPUs, this will return error + (print_exec amd-smi) || true (print_exec hipcc -v) || true echo "[INSTALL] Successfully installed ROCm ${rocm_version}" diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash index 554b510c99..d2c4d33d0d 100644 --- a/.github/scripts/utils_system.bash +++ b/.github/scripts/utils_system.bash @@ -173,26 +173,19 @@ print_gpu_info () { (lspci -v | grep -e 'Display controller: Advanced') || true if [[ "${ENFORCE_ROCM_DEVICE}" ]]; then - # Ensure that rocm-smi is available and returns GPU entries - if ! rocm-smi; then + # Ensure that amd-smi is available and returns GPU entries + if ! amd-smi; then echo "[CHECK] ROCm drivers and ROCm device(s) are required for this workflow, but does not appear to be installed or available!" 
return 1 fi else - if which rocm-smi; then - echo "[CHECK] rocm-smi found; printing info ..." + if which amd-smi; then + echo "[CHECK] amd-smi found; printing info ..." # If the program is installed on a machine without GPUs, invoking it will return error - (print_exec rocm-smi --showproductname) || true + (print_exec amd-smi static --asic) || true else - echo "[CHECK] rocm-smi not found" - fi - - if which rocminfo; then - echo "[CHECK] rocminfo found; printing info ..." - (print_exec rocminfo) || true - else - echo "[CHECK] rocminfo not found" + echo "[CHECK] amd-smi not found" fi fi } diff --git a/ci/utils/gpu_detect.bash b/ci/utils/gpu_detect.bash index 2e0d0c4c23..79333eceaa 100644 --- a/ci/utils/gpu_detect.bash +++ b/ci/utils/gpu_detect.bash @@ -30,7 +30,7 @@ detect_gpu_vendor() { if command -v nvidia-smi &> /dev/null; then echo "nvidia" - elif command -v rocm-smi &> /dev/null; then + elif command -v amd-smi &> /dev/null; then echo "amd" else echo "" @@ -111,13 +111,13 @@ detect_nvidia_gpu_model() { # Detect the GPU model of the first AMD GPU. # -# This function queries rocm-smi for the GFX Version and maps it to a known +# This function queries amd-smi for the GFX Version and maps it to a known # GPU model name using the AMD_GFX_MODEL_MAP associative array. # The returned model name is always lowercased. # # Returns: # Lowercased GPU model (e.g., "mi300", "mi350", "mi250") -# "" - if rocm-smi is not available or no GPU is detected +# "" - if amd-smi is not available or no GPU is detected # # Usage: # source gpu.bash @@ -125,14 +125,14 @@ detect_nvidia_gpu_model() { # echo "GPU model: $model" # e.g., "mi350" # detect_amd_gpu_model() { - # Check if rocm-smi is available - if ! command -v rocm-smi &> /dev/null; then - echo "rocm-smi not found; cannot detect AMD GPU model" >&2 + # Check if amd-smi is available + if ! 
command -v amd-smi &> /dev/null; then + echo "amd-smi not found; cannot detect AMD GPU model" >&2 return 1 fi # Associative array mapping GFX versions to GPU model names. - # Keys are the GFX versions (from "GFX Version" field in rocm-smi --showproductname). + # Keys are the GFX versions (from "TARGET_GRAPHICS_VERSION" field in amd-smi static --asic). # Values are the desired lowercased model names. # # Target architecture, card model, and ROCm compatibility tables can be found @@ -142,7 +142,7 @@ detect_amd_gpu_model() { # https://www.coelacanth-dream.com/posts/2019/12/30/did-rid-product-matome-p2/ # # To find the GFX version for a new GPU, run: - # rocm-smi --showproductname | grep "GFX Version" + # amd-smi static --asic | grep "TARGET_GRAPHICS_VERSION" # declare -A AMD_GFX_MODEL_MAP=( # MI350 series (CDNA 4) @@ -157,11 +157,13 @@ detect_amd_gpu_model() { ["gfx906"]="mi50" ) - # Get the GFX Version from rocm-smi (first GPU only) - # rocm-smi --showproductname outputs something like: - # GPU[0] : GFX Version: gfx950 + # Get the GFX Version from amd-smi (first GPU only) + # amd-smi static --asic outputs something like: + # GPU: 0 + # ASIC: + # TARGET_GRAPHICS_VERSION: gfx950 local gfx_version - gfx_version=$(rocm-smi --showproductname 2>/dev/null | grep -m1 "GFX Version:" | sed 's/.*GFX Version:[[:space:]]*//' | xargs) + gfx_version=$(amd-smi static --asic 2>/dev/null | grep -m1 "TARGET_GRAPHICS_VERSION:" | sed 's/.*TARGET_GRAPHICS_VERSION:[[:space:]]*//' | xargs) if [[ -z "$gfx_version" ]]; then echo "Could not detect AMD GPU GFX version" >&2 @@ -269,7 +271,7 @@ detect_gpu_count() { if [[ "${vendor}" == "nvidia" ]]; then nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null | wc -l elif [[ "${vendor}" == "amd" ]]; then - rocm-smi --showid 2>/dev/null | grep -oP "GPU\[\K[0-9]+" | sort -u | wc -l + amd-smi list 2>/dev/null | grep -c "^GPU" else echo 1 fi @@ -311,7 +313,7 @@ gpu_is_busy() { fi elif [[ "${vendor}" == "amd" ]]; then local util - 
util=$(rocm-smi -d "${gpu_id}" --showuse 2>/dev/null | grep "GPU use" | awk '{print $NF}' | tr -d '%' || echo "0") + util=$(amd-smi metric -g "${gpu_id}" --usage 2>/dev/null | grep "GFX_ACTIVITY:" | awk '{print $(NF-1)}' || echo "0") if [[ "${util}" -gt "${util_threshold}" ]]; then return 0 fi diff --git a/cmake/modules/GpuCppLibrary.cmake b/cmake/modules/GpuCppLibrary.cmake index f05ad3dfb4..7c38fd72b1 100644 --- a/cmake/modules/GpuCppLibrary.cmake +++ b/cmake/modules/GpuCppLibrary.cmake @@ -257,8 +257,6 @@ function(gpu_cpp_library) # Append ROCM includes target_include_directories(${lib_name} PUBLIC ${FBGEMM_HIP_INCLUDE} - ${ROCRAND_INCLUDE} - ${ROCM_SMI_INCLUDE} ${args_INCLUDE_DIRS}) else() @@ -391,6 +389,11 @@ function(gpu_cpp_library) list(APPEND library_dependencies ${NVML_LIB_PATH}) endif() + # Add AMD SMI if available (ROCm builds) + if(FBGEMM_AMDSMI_LIB) + list(APPEND library_dependencies ${FBGEMM_AMDSMI_LIB}) + endif() + # Link against the external libraries as needed target_link_libraries(${lib_name} PRIVATE ${library_dependencies}) diff --git a/fbgemm_gpu/cmake/Hip.cmake b/fbgemm_gpu/cmake/Hip.cmake index e81377d4bd..445dd99d24 100644 --- a/fbgemm_gpu/cmake/Hip.cmake +++ b/fbgemm_gpu/cmake/Hip.cmake @@ -68,6 +68,10 @@ if(HIP_FOUND) # setup, hcc is only used for linking, but it should be used to # compile the *_hip.cc files as well. 
find_library(FBGEMM_HIP_HCC_LIBRARIES ${hip_library_name} HINTS ${ROCM_PATH}/lib) + find_library(FBGEMM_AMDSMI_LIB amd_smi HINTS ${ROCM_PATH}/lib) + if(FBGEMM_AMDSMI_LIB) + message(STATUS "Found AMD SMI library: ${FBGEMM_AMDSMI_LIB}") + endif() list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_OPERATORS__=1) # list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1) @@ -100,7 +104,7 @@ if(HIP_FOUND) set(FBGEMM_HIP_INCLUDE ${ROCM_PATH}/include ${FBGEMM_HIP_INCLUDE}) set(FBGEMM_HIP_INCLUDE ${hip_INCLUDE_DIRS} $ $ ${FBGEMM_HIP_INCLUDE}) - hip_include_directories(${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE}) + hip_include_directories(${FBGEMM_HIP_INCLUDE}) list (APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH}) endif() diff --git a/fbgemm_gpu/docs/src/fbgemm_gpu/development/InstallationInstructions.rst b/fbgemm_gpu/docs/src/fbgemm_gpu/development/InstallationInstructions.rst index bfa8fbea42..3ea82179f9 100644 --- a/fbgemm_gpu/docs/src/fbgemm_gpu/development/InstallationInstructions.rst +++ b/fbgemm_gpu/docs/src/fbgemm_gpu/development/InstallationInstructions.rst @@ -121,20 +121,28 @@ The AMDGPU display drivers must be installed on the system prior to all other environment setup. The steps provided by `AMD `__ are the most authoritative instructions for doing this. Driver setup may be -verified with the ``rocm-smi`` command: +verified with the ``amd-smi`` command: .. 
code:: sh - rocm-smi - - ======================= ROCm System Management Interface ======================= - ================================= Concise Info ================================= - GPU Temp (DieEdge) AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% - 0 33.0c 37.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0% - 1 32.0c 39.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0% - 2 33.0c 37.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0% - ================================================================================ - ============================= End of ROCm SMI Log ============================== + amd-smi + ++------------------------------------------------------------------------------+ +| AMD-SMI 26.3.0+615aab95ed | +| amdgpu Version: 6.14.19 | +| ROCm Version: 7.3.0 | +| VBIOS Version: 020.001.000.060.000000 | +| Platform: Linux Baremetal | +|-------------------------------------+----------------------------------------| +| BDF GPU-Name | Mem-Uti Temp UEC Power-Usage | +| GPU HIP-ID OAM-ID Partition-Mode | GFX-Uti Fan Mem-Usage | +|=====================================+========================================| +| 0000:43:00.0 AMD Radeon RX 6800 XT | 0 % 32 °C 0 12/272 W | +| 0 1 N/A N/A | 0 % 0.0 % 16/16368 MB | +|-------------------------------------+----------------------------------------| +| 0000:63:00.0 Radeon RX 7900 XT | 0 % 41 °C 0 50/257 W | +| 1 0 N/A N/A | 4 % 0.0 % 26/20464 MB | ++-------------------------------------+----------------------------------------+ Set Up the ROCm Docker Container and Conda Environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/fbgemm_gpu/docs/src/fbgemm_gpu/development/TestInstructions.rst b/fbgemm_gpu/docs/src/fbgemm_gpu/development/TestInstructions.rst index f66038fdf9..1ac3bdcc7e 100644 --- a/fbgemm_gpu/docs/src/fbgemm_gpu/development/TestInstructions.rst +++ b/fbgemm_gpu/docs/src/fbgemm_gpu/development/TestInstructions.rst @@ -78,7 +78,7 @@ For ROCm machines, testing against a ROCm GPU needs to be enabled with # Specify 
the specific HIP devices to run the tests on # # NOTE: This is necessary if PyTorch is unable to see the devices that - # `rocm-smi --showproductname` can see + # `amd-smi static --asic` can see export HIP_VISIBLE_DEVICES=0,1,2,3 # Enable for debugging kernel executions diff --git a/fbgemm_gpu/src/topology_utils.cpp b/fbgemm_gpu/src/topology_utils.cpp index 240caaf8cf..82d2936b00 100644 --- a/fbgemm_gpu/src/topology_utils.cpp +++ b/fbgemm_gpu/src/topology_utils.cpp @@ -15,32 +15,47 @@ #ifdef USE_ROCM #include <cinttypes> +#include "amd_smi/amdsmi.h" #include "hip/hip_runtime.h" -#include "rocm_smi/rocm_smi.h" -#define RSMI_CHECK(fn) \ - do { \ - rsmi_status_t ret = (fn); \ - TORCH_CHECK_EQ((ret), RSMI_STATUS_SUCCESS); \ +#define AMDSMI_CHECK(fn) \ + do { \ + amdsmi_status_t ret = (fn); \ + TORCH_CHECK_EQ((ret), AMDSMI_STATUS_SUCCESS); \ } while (0) -#define RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 +#define AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 namespace fbgemm_gpu { AdjacencyMatrix get_nvlink_matrix() { auto world_size = at::cuda::getNumGPUs(); - RSMI_CHECK(rsmi_init(0)); + AMDSMI_CHECK(amdsmi_init(AMDSMI_INIT_AMD_GPUS)); - // Note that ROCm_SMI uses a different numbering method to ROCm runtime, + // Note that AMD SMI uses a different numbering method to ROCm runtime, // so we need to learn the mapping by using the bus ID. - uint32_t device_count; - RSMI_CHECK(rsmi_num_monitor_devices(&device_count)); - std::unordered_map<int32_t, uint32_t> rocm_device_to_rsmi_device; + // Get all sockets, then collect all GPU processor handles across sockets. 
+ uint32_t socket_count = 0; + AMDSMI_CHECK(amdsmi_get_socket_handles(&socket_count, nullptr)); + std::vector<amdsmi_socket_handle> sockets(socket_count); + AMDSMI_CHECK(amdsmi_get_socket_handles(&socket_count, sockets.data())); + + std::vector<amdsmi_processor_handle> processor_handles; + for (uint32_t s = 0; s < socket_count; s++) { + uint32_t device_count = 0; + AMDSMI_CHECK(amdsmi_get_processor_handles(sockets[s], &device_count, nullptr)); + std::vector<amdsmi_processor_handle> socket_handles(device_count); + AMDSMI_CHECK(amdsmi_get_processor_handles( + sockets[s], &device_count, socket_handles.data())); + processor_handles.insert( + processor_handles.end(), socket_handles.begin(), socket_handles.end()); + } - for (const auto i : c10::irange(device_count)) { + std::unordered_map<int32_t, amdsmi_processor_handle> hip_device_to_handle; + + for (const auto& handle : processor_handles) { uint64_t pci_info; - RSMI_CHECK(rsmi_dev_pci_id_get(i, &pci_info)); + AMDSMI_CHECK(amdsmi_get_gpu_bdf_id(handle, &pci_info)); uint64_t domain, bus, device, function; domain = (pci_info >> 32) & 0xffffffff; bus = (pci_info >> 8) & 0xff; @@ -48,7 +63,7 @@ AdjacencyMatrix get_nvlink_matrix() { function = pci_info & 0x7; // Different from CUDA, we do not get the PCI BUS ID as a char* and we need // to reconstruct it. - char pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + char pci_bus_id_str[AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; sprintf( pci_bus_id_str, "%04" PRIu64 ":%02" PRIu64 ":%02" PRIu64 ".%0" PRIu64, @@ -57,15 +72,15 @@ AdjacencyMatrix get_nvlink_matrix() { domain, bus, device, function); - std::array<char, RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id; + std::array<char, AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id; std::copy( &pci_bus_id_str[0], - &pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE], + &pci_bus_id_str[AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE], pci_bus_id.data()); int32_t node = 0; auto err = hipDeviceGetByPCIBusId(&node, pci_bus_id.data()); if (err == hipSuccess) { - rocm_device_to_rsmi_device.insert({node, i}); + hip_device_to_handle.insert({node, handle}); } else { // flush the last error - this can occur when e.g. 
we set // HIP_VISIBLE_DEVICES to a subset of the available GPUs in the system. @@ -75,14 +90,14 @@ AdjacencyMatrix get_nvlink_matrix() { std::vector links(world_size * world_size); for (const auto i : c10::irange(world_size)) { - auto src_rsmi_device = rocm_device_to_rsmi_device.find(i); - if (src_rsmi_device != rocm_device_to_rsmi_device.end()) { + auto src = hip_device_to_handle.find(i); + if (src != hip_device_to_handle.end()) { for (const auto j : c10::irange(world_size)) { - auto dst_rsmi_device = rocm_device_to_rsmi_device.find(j); - if (dst_rsmi_device != rocm_device_to_rsmi_device.end()) { + auto dst = hip_device_to_handle.find(j); + if (dst != hip_device_to_handle.end()) { bool is_active; - RSMI_CHECK(rsmi_is_P2P_accessible( - src_rsmi_device->second, dst_rsmi_device->second, &is_active)); + AMDSMI_CHECK( + amdsmi_is_P2P_accessible(src->second, dst->second, &is_active)); if (is_active) { links[i * world_size + j] += 1; } @@ -90,7 +105,7 @@ AdjacencyMatrix get_nvlink_matrix() { } } } - RSMI_CHECK(rsmi_shut_down()); + AMDSMI_CHECK(amdsmi_shut_down()); return [=](Node i, Node j) { TORCH_CHECK_LT(i, world_size); TORCH_CHECK_LT(j, world_size);