diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 984e364ca5..d10ed0a96f 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -152,7 +152,7 @@ __configure_fbgemm_gpu_test_rocm () { # AMD GPUs need to be explicitly made visible to PyTorch for use # shellcheck disable=SC2155,SC2126 - local num_gpus=$(rocm-smi --showproductname | grep GUID | wc -l) + local num_gpus=$(amd-smi list | grep -c "^GPU") # shellcheck disable=SC2155 local gpu_indices=$(seq 0 $((num_gpus - 1)) | paste -sd, -) # shellcheck disable=SC2086 diff --git a/.github/scripts/utils_rocm.bash b/.github/scripts/utils_rocm.bash index 9836b60770..8e0d4d2e15 100644 --- a/.github/scripts/utils_rocm.bash +++ b/.github/scripts/utils_rocm.bash @@ -100,10 +100,9 @@ install_rocm_ubuntu () { echo "[INSTALL] Cleaning up ..." print_exec rm -f "${package_name}" - echo "[INFO] Printing ROCM utilities info ..." - # If rocm-smi is installed on a machine without GPUs, this will return error - (print_exec rocminfo) || true - (print_exec rocm-smi) || true + echo "[INFO] Printing AMD-SMI utilities info ..." + # If amd-smi is installed on a machine without GPUs, this will return error + (print_exec amd-smi) || true (print_exec hipcc -v) || true echo "[INSTALL] Successfully installed ROCm ${rocm_version}" diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash index 554b510c99..d2c4d33d0d 100644 --- a/.github/scripts/utils_system.bash +++ b/.github/scripts/utils_system.bash @@ -173,26 +173,19 @@ print_gpu_info () { (lspci -v | grep -e 'Display controller: Advanced') || true if [[ "${ENFORCE_ROCM_DEVICE}" ]]; then - # Ensure that rocm-smi is available and returns GPU entries - if ! rocm-smi; then + # Ensure that amd-smi is available and returns GPU entries + if ! amd-smi; then echo "[CHECK] ROCm drivers and ROCm device(s) are required for this workflow, but does not appear to be installed or available!" 
return 1 fi else - if which rocm-smi; then - echo "[CHECK] rocm-smi found; printing info ..." + if which amd-smi; then + echo "[CHECK] amd-smi found; printing info ..." # If the program is installed on a machine without GPUs, invoking it will return error - (print_exec rocm-smi --showproductname) || true + (print_exec amd-smi static --asic) || true else - echo "[CHECK] rocm-smi not found" - fi - - if which rocminfo; then - echo "[CHECK] rocminfo found; printing info ..." - (print_exec rocminfo) || true - else - echo "[CHECK] rocminfo not found" + echo "[CHECK] amd-smi not found" fi fi } diff --git a/ci/utils/gpu_detect.bash b/ci/utils/gpu_detect.bash index 2e0d0c4c23..79333eceaa 100644 --- a/ci/utils/gpu_detect.bash +++ b/ci/utils/gpu_detect.bash @@ -30,7 +30,7 @@ detect_gpu_vendor() { if command -v nvidia-smi &> /dev/null; then echo "nvidia" - elif command -v rocm-smi &> /dev/null; then + elif command -v amd-smi &> /dev/null; then echo "amd" else echo "" @@ -111,13 +111,13 @@ detect_nvidia_gpu_model() { # Detect the GPU model of the first AMD GPU. # -# This function queries rocm-smi for the GFX Version and maps it to a known +# This function queries amd-smi for the GFX Version and maps it to a known # GPU model name using the AMD_GFX_MODEL_MAP associative array. # The returned model name is always lowercased. # # Returns: # Lowercased GPU model (e.g., "mi300", "mi350", "mi250") -# "" - if rocm-smi is not available or no GPU is detected +# "" - if amd-smi is not available or no GPU is detected # # Usage: # source gpu.bash @@ -125,14 +125,14 @@ detect_nvidia_gpu_model() { # echo "GPU model: $model" # e.g., "mi350" # detect_amd_gpu_model() { - # Check if rocm-smi is available - if ! command -v rocm-smi &> /dev/null; then - echo "rocm-smi not found; cannot detect AMD GPU model" >&2 + # Check if amd-smi is available + if ! 
command -v amd-smi &> /dev/null; then + echo "amd-smi not found; cannot detect AMD GPU model" >&2 return 1 fi # Associative array mapping GFX versions to GPU model names. - # Keys are the GFX versions (from "GFX Version" field in rocm-smi --showproductname). + # Keys are the GFX versions (from "TARGET_GRAPHICS_VERSION" field in amd-smi static --asic). # Values are the desired lowercased model names. # # Target architecture, card model, and ROCm compatibility tables can be found @@ -142,7 +142,7 @@ detect_amd_gpu_model() { # https://www.coelacanth-dream.com/posts/2019/12/30/did-rid-product-matome-p2/ # # To find the GFX version for a new GPU, run: - # rocm-smi --showproductname | grep "GFX Version" + # amd-smi static --asic | grep "TARGET_GRAPHICS_VERSION" # declare -A AMD_GFX_MODEL_MAP=( # MI350 series (CDNA 4) @@ -157,11 +157,13 @@ detect_amd_gpu_model() { ["gfx906"]="mi50" ) - # Get the GFX Version from rocm-smi (first GPU only) - # rocm-smi --showproductname outputs something like: - # GPU[0] : GFX Version: gfx950 + # Get the GFX Version from amd-smi (first GPU only) + # amd-smi static --asic outputs something like: + # GPU: 0 + # ASIC: + # TARGET_GRAPHICS_VERSION: gfx950 local gfx_version - gfx_version=$(rocm-smi --showproductname 2>/dev/null | grep -m1 "GFX Version:" | sed 's/.*GFX Version:[[:space:]]*//' | xargs) + gfx_version=$(amd-smi static --asic 2>/dev/null | grep -m1 "TARGET_GRAPHICS_VERSION:" | sed 's/.*TARGET_GRAPHICS_VERSION:[[:space:]]*//' | xargs) if [[ -z "$gfx_version" ]]; then echo "Could not detect AMD GPU GFX version" >&2 @@ -269,7 +271,7 @@ detect_gpu_count() { if [[ "${vendor}" == "nvidia" ]]; then nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null | wc -l elif [[ "${vendor}" == "amd" ]]; then - rocm-smi --showid 2>/dev/null | grep -oP "GPU\[\K[0-9]+" | sort -u | wc -l + amd-smi list 2>/dev/null | grep -c "^GPU" else echo 1 fi @@ -311,7 +313,7 @@ gpu_is_busy() { fi elif [[ "${vendor}" == "amd" ]]; then local util - 
util=$(rocm-smi -d "${gpu_id}" --showuse 2>/dev/null | grep "GPU use" | awk '{print $NF}' | tr -d '%' || echo "0") + util=$(amd-smi metric -g "${gpu_id}" --usage 2>/dev/null | grep "GFX_ACTIVITY:" | awk '{print $(NF-1)}' || echo "0") if [[ "${util}" -gt "${util_threshold}" ]]; then return 0 fi diff --git a/cmake/modules/GpuCppLibrary.cmake b/cmake/modules/GpuCppLibrary.cmake index f05ad3dfb4..7c38fd72b1 100644 --- a/cmake/modules/GpuCppLibrary.cmake +++ b/cmake/modules/GpuCppLibrary.cmake @@ -257,8 +257,6 @@ function(gpu_cpp_library) # Append ROCM includes target_include_directories(${lib_name} PUBLIC ${FBGEMM_HIP_INCLUDE} - ${ROCRAND_INCLUDE} - ${ROCM_SMI_INCLUDE} ${args_INCLUDE_DIRS}) else() @@ -391,6 +389,11 @@ function(gpu_cpp_library) list(APPEND library_dependencies ${NVML_LIB_PATH}) endif() + # Add AMD SMI if available (ROCm builds) + if(FBGEMM_AMDSMI_LIB) + list(APPEND library_dependencies ${FBGEMM_AMDSMI_LIB}) + endif() + # Link against the external libraries as needed target_link_libraries(${lib_name} PRIVATE ${library_dependencies}) diff --git a/fbgemm_gpu/cmake/Hip.cmake b/fbgemm_gpu/cmake/Hip.cmake index e81377d4bd..445dd99d24 100644 --- a/fbgemm_gpu/cmake/Hip.cmake +++ b/fbgemm_gpu/cmake/Hip.cmake @@ -68,6 +68,10 @@ if(HIP_FOUND) # setup, hcc is only used for linking, but it should be used to # compile the *_hip.cc files as well. 
find_library(FBGEMM_HIP_HCC_LIBRARIES ${hip_library_name} HINTS ${ROCM_PATH}/lib) + find_library(FBGEMM_AMDSMI_LIB amd_smi HINTS ${ROCM_PATH}/lib) + if(FBGEMM_AMDSMI_LIB) + message(STATUS "Found AMD SMI library: ${FBGEMM_AMDSMI_LIB}") + endif() list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_OPERATORS__=1) # list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1) @@ -100,7 +104,7 @@ if(HIP_FOUND) set(FBGEMM_HIP_INCLUDE ${ROCM_PATH}/include ${FBGEMM_HIP_INCLUDE}) set(FBGEMM_HIP_INCLUDE ${hip_INCLUDE_DIRS} $ $ ${FBGEMM_HIP_INCLUDE}) - hip_include_directories(${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE}) + hip_include_directories(${FBGEMM_HIP_INCLUDE}) list (APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH}) endif() diff --git a/fbgemm_gpu/docs/src/fbgemm_gpu/development/InstallationInstructions.rst b/fbgemm_gpu/docs/src/fbgemm_gpu/development/InstallationInstructions.rst index bfa8fbea42..3ea82179f9 100644 --- a/fbgemm_gpu/docs/src/fbgemm_gpu/development/InstallationInstructions.rst +++ b/fbgemm_gpu/docs/src/fbgemm_gpu/development/InstallationInstructions.rst @@ -121,20 +121,28 @@ The AMDGPU display drivers must be installed on the system prior to all other environment setup. The steps provided by `AMD `__ are the most authoritative instructions for doing this. Driver setup may be -verified with the ``rocm-smi`` command: +verified with the ``amd-smi`` command: .. 
code:: sh - rocm-smi - - ======================= ROCm System Management Interface ======================= - ================================= Concise Info ================================= - GPU Temp (DieEdge) AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% - 0 33.0c 37.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0% - 1 32.0c 39.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0% - 2 33.0c 37.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0% - ================================================================================ - ============================= End of ROCm SMI Log ============================== + amd-smi + ++------------------------------------------------------------------------------+ +| AMD-SMI 26.3.0+615aab95ed | +| amdgpu Version: 6.14.19 | +| ROCm Version: 7.3.0 | +| VBIOS Version: 020.001.000.060.000000 | +| Platform: Linux Baremetal | +|-------------------------------------+----------------------------------------| +| BDF GPU-Name | Mem-Uti Temp UEC Power-Usage | +| GPU HIP-ID OAM-ID Partition-Mode | GFX-Uti Fan Mem-Usage | +|=====================================+========================================| +| 0000:43:00.0 AMD Radeon RX 6800 XT | 0 % 32 °C 0 12/272 W | +| 0 1 N/A N/A | 0 % 0.0 % 16/16368 MB | +|-------------------------------------+----------------------------------------| +| 0000:63:00.0 Radeon RX 7900 XT | 0 % 41 °C 0 50/257 W | +| 1 0 N/A N/A | 4 % 0.0 % 26/20464 MB | ++-------------------------------------+----------------------------------------+ Set Up the ROCm Docker Container and Conda Environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/fbgemm_gpu/docs/src/fbgemm_gpu/development/TestInstructions.rst b/fbgemm_gpu/docs/src/fbgemm_gpu/development/TestInstructions.rst index f66038fdf9..1ac3bdcc7e 100644 --- a/fbgemm_gpu/docs/src/fbgemm_gpu/development/TestInstructions.rst +++ b/fbgemm_gpu/docs/src/fbgemm_gpu/development/TestInstructions.rst @@ -78,7 +78,7 @@ For ROCm machines, testing against a ROCm GPU needs to be enabled with # Specify 
the specific HIP devices to run the tests on # # NOTE: This is necessary if PyTorch is unable to see the devices that - # `rocm-smi --showproductname` can see + # `amd-smi static --asic` can see export HIP_VISIBLE_DEVICES=0,1,2,3 # Enable for debugging kernel executions diff --git a/fbgemm_gpu/src/topology_utils.cpp b/fbgemm_gpu/src/topology_utils.cpp index 240caaf8cf..82d2936b00 100644 --- a/fbgemm_gpu/src/topology_utils.cpp +++ b/fbgemm_gpu/src/topology_utils.cpp @@ -15,32 +15,47 @@ #ifdef USE_ROCM #include <cinttypes> +#include "amd_smi/amdsmi.h" #include "hip/hip_runtime.h" -#include "rocm_smi/rocm_smi.h" -#define RSMI_CHECK(fn) \ - do { \ - rsmi_status_t ret = (fn); \ - TORCH_CHECK_EQ((ret), RSMI_STATUS_SUCCESS); \ +#define AMDSMI_CHECK(fn) \ + do { \ + amdsmi_status_t ret = (fn); \ + TORCH_CHECK_EQ((ret), AMDSMI_STATUS_SUCCESS); \ } while (0) -#define RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 +#define AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 namespace fbgemm_gpu { AdjacencyMatrix get_nvlink_matrix() { auto world_size = at::cuda::getNumGPUs(); - RSMI_CHECK(rsmi_init(0)); + AMDSMI_CHECK(amdsmi_init(AMDSMI_INIT_AMD_GPUS)); - // Note that ROCm_SMI uses a different numbering method to ROCm runtime, + // Note that AMD SMI uses a different numbering method to ROCm runtime, // so we need to learn the mapping by using the bus ID. - uint32_t device_count; - RSMI_CHECK(rsmi_num_monitor_devices(&device_count)); - std::unordered_map<int32_t, uint32_t> rocm_device_to_rsmi_device; + // Get all sockets, then collect all GPU processor handles across sockets. 
+ uint32_t socket_count = 0; + AMDSMI_CHECK(amdsmi_get_socket_handles(&socket_count, nullptr)); + std::vector<amdsmi_socket_handle> sockets(socket_count); + AMDSMI_CHECK(amdsmi_get_socket_handles(&socket_count, sockets.data())); + + std::vector<amdsmi_processor_handle> processor_handles; + for (uint32_t s = 0; s < socket_count; s++) { + uint32_t device_count = 0; + AMDSMI_CHECK(amdsmi_get_processor_handles(sockets[s], &device_count, nullptr)); + std::vector<amdsmi_processor_handle> socket_handles(device_count); + AMDSMI_CHECK(amdsmi_get_processor_handles( + sockets[s], &device_count, socket_handles.data())); + processor_handles.insert( + processor_handles.end(), socket_handles.begin(), socket_handles.end()); + } - for (const auto i : c10::irange(device_count)) { + std::unordered_map<int32_t, amdsmi_processor_handle> hip_device_to_handle; + + for (const auto& handle : processor_handles) { uint64_t pci_info; - RSMI_CHECK(rsmi_dev_pci_id_get(i, &pci_info)); + AMDSMI_CHECK(amdsmi_get_gpu_bdf_id(handle, &pci_info)); uint64_t domain, bus, device, function; domain = (pci_info >> 32) & 0xffffffff; bus = (pci_info >> 8) & 0xff; @@ -48,7 +63,7 @@ AdjacencyMatrix get_nvlink_matrix() { function = pci_info & 0x7; // Different from CUDA, we do not get the PCI BUS ID as a char* and we need // to reconstruct it. - char pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + char pci_bus_id_str[AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; sprintf( pci_bus_id_str, "%04" PRIu64 ":%02" PRIu64 ":%02" PRIu64 ".%0" PRIu64, @@ -57,15 +72,15 @@ AdjacencyMatrix get_nvlink_matrix() { domain, bus, device, function); - std::array<char, RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id; + std::array<char, AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id; std::copy( &pci_bus_id_str[0], - &pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE], + &pci_bus_id_str[AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE], pci_bus_id.data()); int32_t node = 0; auto err = hipDeviceGetByPCIBusId(&node, pci_bus_id.data()); if (err == hipSuccess) { - rocm_device_to_rsmi_device.insert({node, i}); + hip_device_to_handle.insert({node, handle}); } else { // flush the last error - this can occur when e.g. 
we set // HIP_VISIBLE_DEVICES to a subset of the available GPUs in the system. @@ -75,14 +90,14 @@ AdjacencyMatrix get_nvlink_matrix() { std::vector links(world_size * world_size); for (const auto i : c10::irange(world_size)) { - auto src_rsmi_device = rocm_device_to_rsmi_device.find(i); - if (src_rsmi_device != rocm_device_to_rsmi_device.end()) { + auto src = hip_device_to_handle.find(i); + if (src != hip_device_to_handle.end()) { for (const auto j : c10::irange(world_size)) { - auto dst_rsmi_device = rocm_device_to_rsmi_device.find(j); - if (dst_rsmi_device != rocm_device_to_rsmi_device.end()) { + auto dst = hip_device_to_handle.find(j); + if (dst != hip_device_to_handle.end()) { bool is_active; - RSMI_CHECK(rsmi_is_P2P_accessible( - src_rsmi_device->second, dst_rsmi_device->second, &is_active)); + AMDSMI_CHECK( + amdsmi_is_P2P_accessible(src->second, dst->second, &is_active)); if (is_active) { links[i * world_size + j] += 1; } @@ -90,7 +105,7 @@ AdjacencyMatrix get_nvlink_matrix() { } } } - RSMI_CHECK(rsmi_shut_down()); + AMDSMI_CHECK(amdsmi_shut_down()); return [=](Node i, Node j) { TORCH_CHECK_LT(i, world_size); TORCH_CHECK_LT(j, world_size);