diff --git a/mlir/include/mlir/Dialect/Rock/IR/AmdArchDb.h b/mlir/include/mlir/Dialect/Rock/IR/AmdArchDb.h
index 8835dd96235e..e192d553a39c 100644
--- a/mlir/include/mlir/Dialect/Rock/IR/AmdArchDb.h
+++ b/mlir/include/mlir/Dialect/Rock/IR/AmdArchDb.h
@@ -33,6 +33,7 @@ struct AmdArchInfo {
   int64_t minNumCU;
   bool hasFp8ConversionInstrs;
   bool hasOcpFp8ConversionInstrs;
+  bool hasFp4;
   bool hasScaledGemm;
   int64_t maxNumXCC;
   bool hasLdsTransposeLoad;
@@ -42,14 +43,15 @@ struct AmdArchInfo {
                         int64_t totalVGPRPerEU, int64_t sharedMemPerCU,
                         int64_t sharedMemPerWG, int64_t numEUPerCU,
                         int64_t minNumCU, bool hasFp8ConversionInstrs,
-                        bool hasOcpFp8ConversionInstrs, bool hasScaledGemm,
-                        int64_t maxNumXCC, bool hasLdsTransposeLoad)
+                        bool hasOcpFp8ConversionInstrs, bool hasFp4,
+                        bool hasScaledGemm, int64_t maxNumXCC,
+                        bool hasLdsTransposeLoad)
       : defaultFeatures(defaultFeatures), waveSize(waveSize),
         maxWavesPerEU(maxWavesPerEU), totalSGPRPerEU(totalSGPRPerEU),
         totalVGPRPerEU(totalVGPRPerEU), totalSharedMemPerCU(sharedMemPerCU),
         maxSharedMemPerWG(sharedMemPerWG), numEUPerCU(numEUPerCU),
         minNumCU(minNumCU), hasFp8ConversionInstrs(hasFp8ConversionInstrs),
-        hasOcpFp8ConversionInstrs(hasOcpFp8ConversionInstrs),
+        hasOcpFp8ConversionInstrs(hasOcpFp8ConversionInstrs), hasFp4(hasFp4),
         hasScaledGemm(hasScaledGemm), maxNumXCC(maxNumXCC),
         hasLdsTransposeLoad(hasLdsTransposeLoad) {}
 
diff --git a/mlir/lib/Dialect/Rock/IR/AmdArchDb.cpp b/mlir/lib/Dialect/Rock/IR/AmdArchDb.cpp
index e222cba2831c..71c4b568215e 100644
--- a/mlir/lib/Dialect/Rock/IR/AmdArchDb.cpp
+++ b/mlir/lib/Dialect/Rock/IR/AmdArchDb.cpp
@@ -45,14 +45,16 @@ static constexpr AmdArchInfo
             /*totalVGPRPerEU*/ 256, /*totalSharedMemPerCU*/ 65536,
             /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/80,
             /*hasFp8ConversionInstrs=*/false,
-            /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false,
+            /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false,
+            /*hasScaledGemm=*/false,
             /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false),
     cdna50Info(GemmFeatures::dot, /*waveSize=*/64, /*maxWavesPerEU*/ 8,
                /*totalSGPRPerEU*/ 512, /*totalVGPRPerEU*/ 256,
                /*totalSharedMemPerCU*/ 65536, /*maxSharedMemPerWG*/ 65536,
                /*numEUPerCU=*/4, /*minNumCU=*/10,
                /*hasFp8ConversionInstrs=*/false,
-               /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false,
+               /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false,
+               /*hasScaledGemm=*/false,
                /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false),
     cdnaInfo(GemmFeatures::mfma | GemmFeatures::dot | GemmFeatures::atomic_add |
                  GemmFeatures::atomic_add_f16,
@@ -60,7 +62,8 @@ static constexpr AmdArchInfo
              /*totalVGPRPerEU*/ 256, /*totalSharedMemPerCU*/ 65536,
              /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/120,
              /*hasFp8ConversionInstrs=*/false,
-             /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false,
+             /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false,
+             /*hasScaledGemm=*/false,
              /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false),
     cdna2Info(GemmFeatures::mfma | GemmFeatures::dot |
                   GemmFeatures::atomic_add | GemmFeatures::atomic_add_f16,
@@ -68,7 +71,8 @@ static constexpr AmdArchInfo
               /*totalVGPRPerEU*/ 512, /*totalSharedMemPerCU*/ 65536,
               /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/104,
               /*hasFp8ConversionInstrs=*/false,
-              /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false,
+              /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false,
+              /*hasScaledGemm=*/false,
               /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false),
     cdna3Info(GemmFeatures::mfma | GemmFeatures::dot |
                   GemmFeatures::atomic_add | GemmFeatures::atomic_add_f16 |
@@ -77,7 +81,8 @@ static constexpr AmdArchInfo
               /*totalVGPRPerEU*/ 512, /*totalSharedMemPerCU*/ 65536,
               /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/20,
               /*hasFp8ConversionInstrs=*/true,
-              /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false,
+              /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false,
+              /*hasScaledGemm=*/false,
               /*maxNumXCC=*/8, /*hasLdsTransposeLoad=*/false),
     cdna40Info(GemmFeatures::mfma | GemmFeatures::dot |
                    GemmFeatures::atomic_add | GemmFeatures::atomic_add_f16 |
@@ -88,7 +93,8 @@ static constexpr AmdArchInfo
                /*totalVGPRPerEU*/ 512, /*totalSharedMemPerCU*/ 163840,
                /*maxSharedMemPerWG*/ 163840, /*numEUPerCU=*/4, /*minNumCU=*/256,
                /*hasFp8ConversionInstrs=*/false,
-               /*hasOcpFp8ConversionInstrs=*/true, /*hasScaledGemm=*/true,
+               /*hasOcpFp8ConversionInstrs=*/true, /*hasFp4=*/true,
+               /*hasScaledGemm=*/true,
                /*maxNumXCC=*/8, /*hasLdsTransposeLoad=*/true),
     // amdgpu target builds all RDNA in WGP Mode
     rdnaNoDotInfo(GemmFeatures::atomic_fmax_f32, /*waveSize=*/32,
@@ -97,14 +103,16 @@ static constexpr AmdArchInfo
                   /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4,
                   /*minNumCU=*/30,
                   /*hasFp8ConversionInstrs=*/false,
-                  /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false,
+                  /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false,
+                  /*hasScaledGemm=*/false,
                   /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false),
     rdnaInfo(GemmFeatures::dot | GemmFeatures::atomic_fmax_f32,
              /*waveSize=*/32, /*maxWavesPerEU*/ 16, /*totalSGPRPerEU*/ 512,
              /*totalVGPRPerEU*/ 1024, /*totalSharedMemPerCU*/ 131072,
              /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/2,
              /*hasFp8ConversionInstrs=*/false,
-             /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false,
+             /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false,
+             /*hasScaledGemm=*/false,
              /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false),
     rdna3Info(GemmFeatures::dot | GemmFeatures::atomic_add |
                   GemmFeatures::atomic_fmax_f32 | GemmFeatures::wmma,
@@ -112,7 +120,8 @@ static constexpr AmdArchInfo
               /*totalVGPRPerEU*/ 1536, /*totalSharedMemPerCU*/ 131072,
               /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/2,
               /*hasFp8ConversionInstrs=*/false,
-              /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false,
+              /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false,
+              /*hasScaledGemm=*/false,
               /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false),
     rdna4Info(GemmFeatures::dot | GemmFeatures::atomic_add |
                   GemmFeatures::atomic_fmax_f32 | GemmFeatures::wmma |
@@ -121,7 +130,8 @@ static constexpr AmdArchInfo
               /*totalVGPRPerEU*/ 1536, /*totalSharedMemPerCU*/ 131072,
               /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/12,
               /*hasFp8ConversionInstrs=*/false,
-              /*hasOcpFp8ConversionInstrs=*/true, /*hasScaledGemm=*/false,
+              /*hasOcpFp8ConversionInstrs=*/true, /*hasFp4=*/false,
+              /*hasScaledGemm=*/false,
               /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false),
     // TODO: update with right information
     gfx1250Info(GemmFeatures::dot | GemmFeatures::atomic_add |
@@ -132,7 +142,8 @@ static constexpr AmdArchInfo
                 /*totalVGPRPerEU*/ 1536, /*totalSharedMemPerCU*/ 131072,
                 /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/12,
                 /*hasFp8ConversionInstrs=*/false,
-                /*hasOcpFp8ConversionInstrs=*/true, /*hasScaledGemm=*/false,
+                /*hasOcpFp8ConversionInstrs=*/true, /*hasFp4=*/false,
+                /*hasScaledGemm=*/false,
                 /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false);
 
 static std::tuple<StringRef, unsigned> parseArchString(StringRef arch) {
@@ -360,8 +371,6 @@ AmdArchInfo nativeArchInfo(unsigned deviceId = 0) {
 #endif // !_WIN32 && ROCMLIR_ENABLE_NATIVE_ARCH
 
 AmdArchInfo mlir::rock::lookupArchInfo(StringRef arch) {
-  // Keep this implementation in sync with
-  // mlir/test/lit.site.cfg.py.in:set_arch_features()
   auto [chip, deviceId] = parseArchString(arch);
   if (chip == "native") {
 #if !defined(_WIN32) && defined(ROCMLIR_ENABLE_NATIVE_ARCH)
diff --git a/mlir/lib/Dialect/Rock/utility/Bindings/AmdArchDbBindings.cpp b/mlir/lib/Dialect/Rock/utility/Bindings/AmdArchDbBindings.cpp
index ada137be7a3a..e9909ae2894e 100644
--- a/mlir/lib/Dialect/Rock/utility/Bindings/AmdArchDbBindings.cpp
+++ b/mlir/lib/Dialect/Rock/utility/Bindings/AmdArchDbBindings.cpp
@@ -10,13 +10,14 @@
 #include <pybind11/pybind11.h>
 
 #include "mlir/Dialect/Rock/IR/AmdArchDb.h"
+#include "llvm/ADT/StringRef.h"
 
 namespace py = pybind11;
 
 PYBIND11_MODULE(amd_arch_db, m) {
   m.doc() = "Database of AMD GPU features";
 
-  py::enum_<mlir::rock::GemmFeatures>(m, "GemmFeatures")
+  py::enum_<mlir::rock::GemmFeatures>(m, "GemmFeatures", py::arithmetic())
       .value("NONE", mlir::rock::GemmFeatures::none)
       .value("MFMA", mlir::rock::GemmFeatures::mfma)
       .value("WMMA", mlir::rock::GemmFeatures::wmma)
@@ -24,7 +25,10 @@ PYBIND11_MODULE(amd_arch_db, m) {
       .value("ATOMIC_ADD", mlir::rock::GemmFeatures::atomic_add)
       .value("ATOMIC_ADD_BF16", mlir::rock::GemmFeatures::atomic_add_bf16)
       .value("ATOMIC_ADD_F16", mlir::rock::GemmFeatures::atomic_add_f16)
-      .value("ATOMIC_FMAX_F32", mlir::rock::GemmFeatures::atomic_fmax_f32);
+      .value("ATOMIC_FMAX_F32", mlir::rock::GemmFeatures::atomic_fmax_f32)
+      .value("DIRECT_TO_LDS_32B", mlir::rock::GemmFeatures::direct_to_lds_32b)
+      .value("DIRECT_TO_LDS_128B",
+             mlir::rock::GemmFeatures::direct_to_lds_128b);
 
   py::class_<mlir::rock::AmdArchInfo>(m, "AmdArchInfo")
       .def_readonly("default_features",
@@ -45,12 +49,31 @@ PYBIND11_MODULE(amd_arch_db, m) {
                     &mlir::rock::AmdArchInfo::hasFp8ConversionInstrs)
       .def_readonly("has_ocp_fp8_conversion_instrs",
                     &mlir::rock::AmdArchInfo::hasOcpFp8ConversionInstrs)
+      .def_readonly("has_fp4", &mlir::rock::AmdArchInfo::hasFp4)
       .def_readonly("has_scaled_gemm", &mlir::rock::AmdArchInfo::hasScaledGemm)
       .def_readonly("max_num_xcc", &mlir::rock::AmdArchInfo::maxNumXCC)
       .def_readonly("has_lds_transpose_load",
                     &mlir::rock::AmdArchInfo::hasLdsTransposeLoad);
 
+  m.def(
+      "has_feature",
+      [](mlir::rock::GemmFeatures features, mlir::rock::GemmFeatures flag) {
+        return bitEnumContainsAny(features, flag);
+      },
+      "Return True if any bit set in `flag` is also set in `features`. "
+      "Matches `bool(int(features) & int(flag))`.");
+
   m.def("lookup_arch_info", [](const std::string &arch) {
+  // The "native:<deviceId>" code path in lookupArchInfo is only compiled in on
+  // non-Windows builds configured with ROCMLIR_ENABLE_NATIVE_ARCH=ON. Without
+  // it the underlying call hits an llvm_unreachable, which would abort the
+  // Python interpreter; raise a Python-level error instead.
+#if defined(_WIN32) || !defined(ROCMLIR_ENABLE_NATIVE_ARCH)
+    if (llvm::StringRef(arch).starts_with("native"))
+      throw py::value_error(
+          "\"native\" arch lookup is not available in this build "
+          "(requires ROCMLIR_ENABLE_NATIVE_ARCH=ON)");
+#endif
     return mlir::rock::lookupArchInfo(arch);
   });
 }
diff --git a/mlir/lib/Dialect/Rock/utility/Bindings/CMakeLists.txt b/mlir/lib/Dialect/Rock/utility/Bindings/CMakeLists.txt
index efc13d07ff5e..cf8be7ae29c3 100644
--- a/mlir/lib/Dialect/Rock/utility/Bindings/CMakeLists.txt
+++ b/mlir/lib/Dialect/Rock/utility/Bindings/CMakeLists.txt
@@ -27,4 +27,12 @@ if(NOT WIN32)
 
     pybind11_add_module(amd_arch_db AmdArchDbBindings.cpp)
     target_link_libraries(amd_arch_db PUBLIC MLIRRockUtility)
+    if(ROCMLIR_ENABLE_NATIVE_ARCH)
+        target_compile_definitions(amd_arch_db PRIVATE ROCMLIR_ENABLE_NATIVE_ARCH=1)
+    endif()
+    set_target_properties(amd_arch_db PROPERTIES
+        LIBRARY_OUTPUT_DIRECTORY "${ROCMLIR_BIN_DIR}")
+    message(VERBOSE "amd_arch_db Python binding will be built in ${ROCMLIR_BIN_DIR}")
+    message(VERBOSE "To use it outside the build bin directory, set: "
+                    "export PYTHONPATH=${ROCMLIR_BIN_DIR}:\$PYTHONPATH")
 endif()
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index c2852d8720a3..a11fbf077298 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -110,6 +110,10 @@ list(APPEND ROCMLIR_TEST_DEPENDS
   rocmlir-common-python-test-utils
 )
 
+if (TARGET amd_arch_db)
+  list(APPEND ROCMLIR_TEST_DEPENDS amd_arch_db)
+endif()
+
 if(MLIR_ENABLE_ROCM_RUNNER)
   list(APPEND ROCMLIR_TEST_DEPENDS
     mlir_runner_utils
diff --git a/mlir/test/common_utils/common.py b/mlir/test/common_utils/common.py
index 60c70de500f1..8833291327cd 100644
--- a/mlir/test/common_utils/common.py
+++ b/mlir/test/common_utils/common.py
@@ -1,50 +1,31 @@
 from hip import hip
+from amd_arch_db import GemmFeatures, has_feature, lookup_arch_info
 
 
-# Helper function to decode arch to its features
-# Keep this in sync with mlir/lib/Dialect/Rock/Generator/AmdArchDb.cpp:mlir::rock::lookupArchInfo
-def get_arch_features(arch: str):
-    chip_name = arch.split(':')[0]
-    if len(chip_name) < 5:
-        return
+def features_to_string(features):
+    """Spell a GemmFeatures mask as '|'-joined lower-case names ('none' if empty).
+
+    Names come out in GemmFeatures.__members__ order, i.e. the .value(...)
+    chain in mlir/lib/Dialect/Rock/utility/Bindings/AmdArchDbBindings.cpp,
+    which is kept in sync with the bit positions in RockAttrDefs.td. Do not
+    reorder without updating the bindings; lit tests match on this exact spelling.
+    """
+    mask = int(features)
+    if mask == 0:
+        return 'none'
+    members = GemmFeatures.__members__.items()
+    # Zero-valued members can never satisfy `mask & value`, so they are skipped.
+    return '|'.join(name.lower() for name, member in members if mask & int(member))
+
 
-    arch_features = None
-    support_mfma = False
-    support_wmma = False
-    support_accel_fp8 = False
-    major = chip_name[:-2]
-    minor = chip_name[-2:]
-    if major == 'gfx9':
-        if minor in ['08', '0a']:
-            arch_features = 'mfma|dot|atomic_add|atomic_add_f16'
-        elif minor == '42':
-            arch_features = 'mfma|dot|atomic_add|atomic_add_f16|direct_to_lds_32b'
-            support_accel_fp8 = True
-        elif minor == '50':
-            arch_features = 'mfma|dot|atomic_add|atomic_add_f16|atomic_add_bf16|direct_to_lds_32b|direct_to_lds_128b|lds_transpose_load'
-            support_accel_fp8 = True
-        elif minor == '06':
-            arch_features = 'dot'
-        else:
-            arch_features = 'none'
-    elif major == 'gfx10':
-        if minor in ['11', '13']:
-            arch_features = 'atomic_fmax_f32'
-        elif minor in ['10', '12'] or minor[0] == '3':
-            arch_features = 'dot|atomic_fmax_f32'
-        else:
-            arch_features = 'atomic_fmax_f32'
-    elif major == 'gfx11':
-        arch_features = 'dot|atomic_add|atomic_fmax_f32|wmma'
-    elif major == 'gfx12':
-        arch_features = 'dot|atomic_add|atomic_add_f16|atomic_add_bf16|atomic_fmax_f32|wmma'
-        support_accel_fp8 = True
-    if arch_features and 'mfma' in arch_features:
-        support_mfma = True
-        pass
-    elif arch_features and 'wmma' in arch_features:
-        support_wmma = True
-        pass
+def get_arch_features(arch: str):
+    """Return (feature string, has MFMA, has WMMA, has any fp8 conversion instrs) for `arch`."""
+    info = lookup_arch_info(arch)
+    lds_suffix = '|lds_transpose_load' if info.has_lds_transpose_load else ''
+    arch_features = features_to_string(info.default_features) + lds_suffix
+    support_mfma = has_feature(info.default_features, GemmFeatures.MFMA)
+    support_wmma = has_feature(info.default_features, GemmFeatures.WMMA)
+    support_accel_fp8 = info.has_fp8_conversion_instrs or info.has_ocp_fp8_conversion_instrs
     return arch_features, support_mfma, support_wmma, support_accel_fp8
 
 
@@ -82,4 +63,6 @@ def get_default_agent():
 
 def is_xdlops_present() -> bool:
     """This function checks whether a GPU with xdlops support is present"""
-    return any([agent.startswith("gfx9") for agent in get_agents()])
+    mfma = GemmFeatures.MFMA
+    agent_features = (lookup_arch_info(agent).default_features for agent in get_agents())
+    return any(has_feature(features, mfma) for features in agent_features)
diff --git a/mlir/test/e2e/lit.site.cfg.py.in b/mlir/test/e2e/lit.site.cfg.py.in
index c0ebd0c6f128..0e1aead518e0 100644
--- a/mlir/test/e2e/lit.site.cfg.py.in
+++ b/mlir/test/e2e/lit.site.cfg.py.in
@@ -28,6 +28,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@"
 
 # Add common python test utils
 sys.path.append(config.rocmlir_common_python_tests_utils)
+sys.path.append(config.mlir_rock_tools_dir)
 from common import get_agents, get_arch_features, get_default_agent
 
 # Support substitution of the tools_dir with user parameters. This is
diff --git a/mlir/test/fusion/e2e/lit.site.cfg.py.in b/mlir/test/fusion/e2e/lit.site.cfg.py.in
index d0a38d7f3c81..278cb041b42b 100644
--- a/mlir/test/fusion/e2e/lit.site.cfg.py.in
+++ b/mlir/test/fusion/e2e/lit.site.cfg.py.in
@@ -35,7 +35,8 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@"
 
 # Add common python test utils
 sys.path.append(config.rocmlir_common_python_tests_utils)
-from common import get_agents, get_default_agent
+sys.path.append(config.mlir_rock_tools_dir)
+from common import get_agents, get_arch_features, get_default_agent
 
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
@@ -78,11 +79,7 @@ if config.rocm_path:
                                     "HIP_VISIBLE_DEVICES will be set to '0' to ensure binary compatibility."
                                     % (', '.join(sorted(agents)), default_agent))
             config.arch = default_agent
-            # Check features for the device we'll actually use
-            if any([arch in default_agent for arch in ["gfx908", "gfx90a", "gfx942", "gfx950"]]):
-                config.arch_support_mfma = True
-            elif "gfx11" in default_agent or "gfx12" in default_agent:
-                config.arch_support_wmma = True
+            _, config.arch_support_mfma, config.arch_support_wmma, _ = get_arch_features(default_agent)
         if not config.arch:
             config.no_AMD_GPU = True
     except subprocess.CalledProcessError:
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index fa8e08a52393..fb6443ea2853 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -59,6 +59,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@"
 
 # Add common python test utils
 sys.path.append(config.rocmlir_common_python_tests_utils)
+sys.path.append(config.mlir_rock_tools_dir)
 from common import get_agents, get_arch_features, get_default_agent
 
 # Support substitution of the tools_dir with user parameters. This is
diff --git a/mlir/unittests/Dialect/Rock/AmdArchDbTests.cpp b/mlir/unittests/Dialect/Rock/AmdArchDbTests.cpp
index c30ac8f152b4..f0f04b0c32ed 100644
--- a/mlir/unittests/Dialect/Rock/AmdArchDbTests.cpp
+++ b/mlir/unittests/Dialect/Rock/AmdArchDbTests.cpp
@@ -57,7 +57,10 @@ TEST_P(NativeArchTest, NativeArchInfoMatchesPresetInfo) {
             nativeInfo.hasFp8ConversionInstrs);
   EXPECT_EQ(presetInfo.hasOcpFp8ConversionInstrs,
             nativeInfo.hasOcpFp8ConversionInstrs);
+  EXPECT_EQ(presetInfo.hasFp4, nativeInfo.hasFp4);
+  EXPECT_EQ(presetInfo.hasScaledGemm, nativeInfo.hasScaledGemm);
   EXPECT_GE(presetInfo.maxNumXCC, nativeInfo.maxNumXCC);
+  EXPECT_EQ(presetInfo.hasLdsTransposeLoad, nativeInfo.hasLdsTransposeLoad);
 }
 
 INSTANTIATE_TEST_SUITE_P(NativeArchTests, NativeArchTest,
diff --git a/mlir/utils/performance/CMakeLists.txt b/mlir/utils/performance/CMakeLists.txt
index 2b19d483b0b2..c28085f74d58 100644
--- a/mlir/utils/performance/CMakeLists.txt
+++ b/mlir/utils/performance/CMakeLists.txt
@@ -25,3 +25,7 @@ list(TRANSFORM PERFORMANCE_SCRIPTS PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/")
 
 add_custom_target(ci-performance-scripts
 	COMMAND ${CMAKE_COMMAND} -E copy ${PERFORMANCE_SCRIPTS} ${ROCMLIR_BIN_DIR})
+
+if (TARGET amd_arch_db)
+  add_dependencies(ci-performance-scripts amd_arch_db)
+endif()
diff --git a/mlir/utils/performance/analysis/quickTuningGen.py b/mlir/utils/performance/analysis/quickTuningGen.py
index ed0b745363a5..dc87e8df39a4 100644
--- a/mlir/utils/performance/analysis/quickTuningGen.py
+++ b/mlir/utils/performance/analysis/quickTuningGen.py
@@ -14,6 +14,8 @@
 import pandas as pd
 import pulp
 
+from amd_arch_db import GemmFeatures, has_feature, lookup_arch_info
+
 # Column definitions for grouping problems
 GEMM_COLUMNS = ['TransA', 'TransB', 'G', 'M', 'K', 'N']
 CONV_COLUMNS = [
@@ -37,9 +39,10 @@ def get_instruction_type(arch, dtype, op):
     """Determine instruction type based on architecture, data type, and operation."""
     if op == "attention":
         return "GemmGemm"
-    if arch.startswith("gfx9"):
+    features = lookup_arch_info(arch).default_features
+    if has_feature(features, GemmFeatures.MFMA):
         return "XDL"
-    elif arch.startswith("gfx1") and dtype != "f32":
+    if has_feature(features, GemmFeatures.WMMA) and dtype != "f32":
         return "Wmma"
     return "NonAccel"
 
diff --git a/mlir/utils/performance/analysis/testing-metrics.py b/mlir/utils/performance/analysis/testing-metrics.py
index d4ee7780bb88..03587a7d016d 100644
--- a/mlir/utils/performance/analysis/testing-metrics.py
+++ b/mlir/utils/performance/analysis/testing-metrics.py
@@ -17,33 +17,21 @@
 import argparse
 import math
 import os
-from hip import hip
 
-# TODO use AmdArchDb.py (when it's implemented)
+from amd_arch_db import lookup_arch_info
 
-num_eu_per_cu = 4  # may be changed in newer architectures
 
-
-def hip_check(call_result):
-    err = call_result[0]
-    result = call_result[1:]
-    if len(result) == 1:
-        result = result[0]
-    if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess:
-        raise RuntimeError(str(err))
-    return result
+def get_num_eu_per_cu():
+    return lookup_arch_info("native:0").num_eu_per_cu
 
 
 def assign_num_cu():
     if args.c:
         return int(args.c)
-    else:
-        props = hip.hipDeviceProp_t()
-        hip_check(hip.hipGetDeviceProperties(props, 0))
-        print(
-            "Using info from GPU 0 in your system, the data should have be obtained from the same GPU."
-        )
-        return int(props.multiProcessorCount)
+    print(
+        "Using info from GPU 0 in your system, the data should have been obtained from the same GPU."
+    )
+    return lookup_arch_info("native:0").min_num_cu
 
 
 def analyze_gemm_file(file, n):
@@ -244,7 +232,7 @@ def determine_file_type(file):
     args = parser.parse_args()
 
     num_cus = assign_num_cu()
-    min_num_waves = num_cus * num_eu_per_cu
+    min_num_waves = num_cus * get_num_eu_per_cu()
 
     row_list = []
 
diff --git a/mlir/utils/performance/attentionSweeps.py b/mlir/utils/performance/attentionSweeps.py
index 914c0054c08c..7f31223c8ce8 100755
--- a/mlir/utils/performance/attentionSweeps.py
+++ b/mlir/utils/performance/attentionSweeps.py
@@ -35,9 +35,9 @@
     Options,
     sweep_parameters,
     multiline_repr,
-    infer_codegen_flags_from_arch,
     get_codegen_flags_for_codepath,
 )
+from amd_arch_db import GemmFeatures, has_feature, lookup_arch_info
 
 # GLOBAL VARIABLES
 DATA_TYPES_ATTENTION = initialize_dtypes_attn()
@@ -206,13 +206,13 @@ def _infer_instruction_set(arch: str, requested: str) -> str:
     if requested in ('mfma', 'wmma'):
         return requested
 
-    codepath, _ = infer_codegen_flags_from_arch(arch)
-    if codepath == 'unknown':
-        raise RuntimeError(f"Unknown arch for attention sweep: {arch}")
-    if codepath == 'vanilla':
-        raise RuntimeError(f"Unsupported attention codepath '{codepath}' for arch {arch}. "
-                           "Attention sweep requires MFMA or WMMA.")
-    return codepath
+    features = lookup_arch_info(arch).default_features
+    if has_feature(features, GemmFeatures.MFMA):
+        return 'mfma'
+    if has_feature(features, GemmFeatures.WMMA):
+        return 'wmma'
+    raise RuntimeError(f"Unsupported arch for attention sweep: {arch}. "
+                       "Attention sweep requires MFMA or WMMA.")
 
 
 def _resolve_codegen_flags(arch: str, instruction_set: str) -> list[str]:
@@ -272,7 +272,6 @@ def log_failing_configs(configs: List[AttentionConfiguration], filename: str):
 
 
 def run_attention_sweep(args, options, paths, chip):
-    # TODO: use AmdArchDb python version when available
     try:
         instruction_set = _infer_instruction_set(options.arch, args.codepath)
     except RuntimeError as e:
@@ -354,7 +353,7 @@ def main():
     if chip_match is None:
         raise RuntimeError(f"Could not find GFX chip in arch string: {arch}")
     chip = chip_match.group(0)
-    num_cu = get_num_cu(chip)
+    num_cu = get_num_cu()
     paths = create_paths(None, args.mlir_build_dir)
     options = Options(debug_fails=args.debug_fails,
                       debug=args.debug,
@@ -363,7 +362,7 @@ def main():
                       flags=[],
                       concurrent_tests=args.jobs,
                       num_cu=num_cu,
-                      num_chiplets=get_num_chiplets(chip, num_cu),
+                      num_chiplets=get_num_chiplets(),
                       log_failures=args.log_failures,
                       test_timeout_sec=args.test_timeout_sec)
 
diff --git a/mlir/utils/performance/parameterSweeps.py b/mlir/utils/performance/parameterSweeps.py
index d9776063cf3f..2f265ccd2c55 100755
--- a/mlir/utils/performance/parameterSweeps.py
+++ b/mlir/utils/performance/parameterSweeps.py
@@ -27,6 +27,8 @@
 from perfRunner import get_num_cu
 from perfRunner import get_num_chiplets
 
+from amd_arch_db import GemmFeatures, has_feature, lookup_arch_info
+
 
 @dataclass(frozen=True)
 class Options:
@@ -62,16 +64,16 @@ async def _communicate_with_timeout(proc: asyncio.subprocess.Process,
 
 def get_codegen_flags_for_codepath(arch: str, codepath: str) -> list[str]:
     """Returns rocmlir-gen feature flags for a given codepath and architecture."""
+    features = lookup_arch_info(arch).default_features
+
     if codepath == 'mfma':
         flags = ['-mfma=on', '-dot=on', '-atomic_add=on', '-atomic_add_f16=on']
-        if 'gfx942' in arch:
+        if has_feature(features, GemmFeatures.ATOMIC_ADD_BF16):
+            flags.append('-atomic_add_bf16=on')
+        if has_feature(features, GemmFeatures.DIRECT_TO_LDS_32B):
             flags.append('-direct_to_lds_32b=on')
-        elif 'gfx95' in arch:
-            flags.extend([
-                '-atomic_add_bf16=on',
-                '-direct_to_lds_32b=on',
-                '-direct_to_lds_128b=on',
-            ])
+        if has_feature(features, GemmFeatures.DIRECT_TO_LDS_128B):
+            flags.append('-direct_to_lds_128b=on')
         return flags
 
     if codepath == 'vanilla':
@@ -79,8 +81,10 @@ def get_codegen_flags_for_codepath(arch: str, codepath: str) -> list[str]:
 
     if codepath == 'wmma':
         flags = ['-mfma=off', '-dot=on', '-atomic_add=on', '-wmma=infer']
-        if 'gfx12' in arch:
-            flags.extend(['-atomic_add_f16=on', '-atomic_add_bf16=on'])
+        if has_feature(features, GemmFeatures.ATOMIC_ADD_F16):
+            flags.append('-atomic_add_f16=on')
+        if has_feature(features, GemmFeatures.ATOMIC_ADD_BF16):
+            flags.append('-atomic_add_bf16=on')
         return flags
 
     return []
@@ -94,30 +98,22 @@ def infer_codegen_flags_from_arch(arch: str,
     rely on rocmlir-gen arch auto-detection and return no explicit feature
     flags; flags are only emitted when a codepath override is explicitly
     requested.
 
-    Returns ('unknown', []) when inference fails.
+    Returns ('unknown', []) when the arch reports no default GEMM features.
     """
     supported_codepath = ['mfma', 'vanilla', 'wmma']
     codepath = requested_codepath
 
     if codepath not in supported_codepath:
-        if 'gfx908' in arch or 'gfx90a' in arch:
-            codepath = 'mfma'
-        elif 'gfx942' in arch:
-            codepath = 'mfma'
-        elif 'gfx95' in arch:
+        features = lookup_arch_info(arch).default_features
+        if int(features) == 0:
+            return ('unknown', [])
+        elif has_feature(features, GemmFeatures.MFMA):
             codepath = 'mfma'
-        elif 'gfx906' in arch:
-            codepath = 'vanilla'
-        elif 'gfx1030' in arch:
-            # Use vanilla codepath for gfx1030 until it has its own perf configs.
-            codepath = 'vanilla'
-        elif 'gfx11' in arch:
-            codepath = 'wmma'
-        elif 'gfx12' in arch:
+        elif has_feature(features, GemmFeatures.WMMA):
             codepath = 'wmma'
         else:
-            return ('unknown', [])
+            codepath = 'vanilla'
 
     if requested_codepath in supported_codepath:
         return (codepath, get_codegen_flags_for_codepath(arch, codepath))
@@ -741,8 +737,7 @@ def main() -> bool:
         # For non-perf-config sweeps, let rocmlir-gen infer features from --arch.
         rocmlir_gen_flags = []
 
-    chip = perfRunner.get_chip()
-    num_cu = get_num_cu(chip)
+    num_cu = get_num_cu()
     options = Options(debug=args.debug,
                       quiet=args.quiet,
                       log_failures=args.log_failures,
@@ -751,7 +746,7 @@ def main() -> bool:
                       flags=rocmlir_gen_flags,
                       concurrent_tests=args.jobs,
                       num_cu=num_cu,
-                      num_chiplets=get_num_chiplets(chip, num_cu),
+                      num_chiplets=get_num_chiplets(),
                       test_timeout_sec=args.test_timeout_sec)
 
     paths = perfRunner.create_paths(None, args.mlir_build_dir)
diff --git a/mlir/utils/performance/perfRegressionReport.py b/mlir/utils/performance/perfRegressionReport.py
index 5a89a0d7b657..7a49b9a16331 100644
--- a/mlir/utils/performance/perfRegressionReport.py
+++ b/mlir/utils/performance/perfRegressionReport.py
@@ -18,12 +18,13 @@ def load_mlir_data(filename: str):
         'LDSBankConflict (MIOpen)', 'LDSBankConflict (hipBLASLt)'
     ]
     df.drop(columns=columns_dropped, inplace=True, errors='ignore')
-    # Work around empty PerfConfig field whin migrating from no tuning to yes tuning
+    # Work around empty PerfConfig field when migrating from no tuning to yes tuning
     # Can be removed next time we touch this
     if 'PerfConfig' in df:
         df['PerfConfig'] = df['PerfConfig'].fillna('None')
     if 'numCU' not in df:
-        df.insert(4, 'numCU', get_num_cu(df['Chip'][0]))
+        # numCU is sourced from device 0 of the local host
+        df.insert(4, 'numCU', get_num_cu())
     return df
 
 
diff --git a/mlir/utils/performance/perfRunner.py b/mlir/utils/performance/perfRunner.py
index 8f1add09c46e..fe9e5d6e38fd 100644
--- a/mlir/utils/performance/perfRunner.py
+++ b/mlir/utils/performance/perfRunner.py
@@ -20,7 +20,10 @@
 import pandas as pd
 from hip import hip
 
+from amd_arch_db import GemmFeatures, has_feature, lookup_arch_info
+
 import reportUtils
+
 from perfCommonUtils import Operation, GEMMLibrary
 
 # global variables.
@@ -64,8 +67,6 @@
 ELAPSED_TIME_RE = re.compile(r"Elapsed: ([0-9\.]*) ms")
 # Compiled regexp object used for extracting target chip from arch
 GFX_CHIP_RE = re.compile(r"gfx[0-9a-z]+")
-INFO_ARCH_NAME = re.compile(r"Name:\s*(.*)")
-INFO_ARCH_CU = re.compile(r"Compute Unit:\s*(.*)")
 
 
 def input_layouts(input_layout):
@@ -188,12 +189,25 @@ def get_chip():
     return chip
 
 
+def chip_has_fp8():
+    info = lookup_arch_info(get_chip())
+    return info.has_fp8_conversion_instrs or info.has_ocp_fp8_conversion_instrs
+
+
+def chip_has_fp4():
+    return lookup_arch_info(get_chip()).has_fp4
+
+
+def chip_has_mfma():
+    return has_feature(lookup_arch_info(get_chip()).default_features, GemmFeatures.MFMA)
+
+
 DATA_TYPES_ATTENTION = None
 
 
 def initialize_dtypes_attn():
     global DATA_TYPES_ATTENTION
-    if get_chip().startswith('gfx9'):
+    if chip_has_mfma():
         DATA_TYPES_ATTENTION = DATA_TYPES_ATTENTION_MFMA
     else:
         DATA_TYPES_ATTENTION = DATA_TYPES_ATTENTION_WMMA
@@ -480,8 +494,7 @@ def get_conv_configurations(filename, arch, num_cu, num_chiplets):
 
                 # Skip unsupported datatypes
                 if datatype == 'convfp8':
-                    unsupported_chips = {'gfx908', 'gfx90a', 'gfx1030', 'gfx1101'}
-                    if get_chip() in unsupported_chips:
+                    if not chip_has_fp8():
                         continue
 
                 # Skip int8 non-fwd convolutions
@@ -790,14 +803,11 @@ def get_gemm_configurations(filename,
 
                 # Skip unsupported datatypes
                 if datatype == 'f4E2M1FN':
-                    # TODO: use information from AMDArchDB when it becomes available to determine supported chips
-                    supported_chips = {'gfx950'}
-                    if get_chip() not in supported_chips:
+                    if not chip_has_fp4():
                         continue
 
                 if datatype == 'fp8':
-                    unsupported_chips = {'gfx908', 'gfx90a', 'gfx1030', 'gfx1101'}
-                    if get_chip() in unsupported_chips:
+                    if not chip_has_fp8():
                         continue
 
                 # We need trailing spaces here to account for the concat below
@@ -2296,39 +2306,14 @@ def parse_data_types(data_types):
     return datatypes, out_map
 
 
-def get_num_chiplets(chip, num_cu):
-    # TODO: use AmdArchDb python bindings
-    if "gfx942" in chip and num_cu == 304:
-        return 8
-    if "gfx942" in chip and num_cu == 80:
-        return 4
-    if "gfx950" in chip:
-        return 8
+def get_num_chiplets(device_id: int = 0):
+    # With "native:<id>", max_num_xcc is the device's actual chiplet count, not the maximum
+    return lookup_arch_info(f"native:{device_id}").max_num_xcc
 
-    return 1
 
-
-def get_num_cu(chip):
-    try:
-        rocminfo = subprocess.check_output("/opt/rocm/bin/rocminfo", stderr=subprocess.PIPE)
-    except subprocess.CalledProcessError as e:
-        print(e.stderr.decode('utf-8'))
-        raise
-    except Exception as e:
-        print(f"Exception: {e}")
-        raise
-    rocminfo_lines = rocminfo.decode("utf-8").split("\n")
-    found_chip = False
-    for line in rocminfo_lines:
-        if not found_chip:
-            m = INFO_ARCH_NAME.search(line)
-            if m and chip in m.group(1).strip():
-                found_chip = True
-        if found_chip:
-            compute_unit = INFO_ARCH_CU.search(line)
-            if compute_unit:
-                return int(compute_unit.group(1))
-    assert False, f"Cannot find number of CUs for {chip}"
+def get_num_cu(device_id: int = 0):
+    # With "native:<id>", min_num_cu is the device's actual CU count, not the minimum
+    return lookup_arch_info(f"native:{device_id}").min_num_cu
 
 
 def found_external_tool(paths: Paths,
@@ -2366,8 +2351,8 @@ def main(args=None):
 
     arch = get_arch()
     chip = get_chip()
-    num_cu = get_num_cu(chip)
-    num_chiplets = get_num_chiplets(chip, num_cu)
+    num_cu = get_num_cu()
+    num_chiplets = get_num_chiplets()
     initialize_dtypes_attn()
 
     root_dir = str(
diff --git a/mlir/utils/performance/tests/mock_hip.py b/mlir/utils/performance/tests/mock_hip.py
index 8ff303bab7ec..cd637869464a 100644
--- a/mlir/utils/performance/tests/mock_hip.py
+++ b/mlir/utils/performance/tests/mock_hip.py
@@ -1,4 +1,4 @@
-"""Inject mock 'hip' module so perfRunner can be imported without ROCm (e.g. in CI)."""
+"""Inject mock 'hip' and 'amd_arch_db' modules so perfRunner can be imported without ROCm."""
 import sys
 import types
 
@@ -33,3 +33,77 @@ class _MockHip:
     hip_pkg = types.ModuleType("hip")
     hip_pkg.hip = _MockHip()
     sys.modules["hip"] = hip_pkg
+
+# --- Mock amd_arch_db (compiled C++ extension, unavailable in CI) ---
+# Keep bit positions in sync with GemmFeatures in
+# mlir/include/mlir/Dialect/Rock/IR/RockAttrDefs.td
+
+
+class _MockGemmFeatures:
+    """Minimal mock of the GemmFeatures enum with bitwise (&, |) and int/bool support."""
+
+    def __init__(self, value=0):
+        self._value = int(value)
+
+    def __int__(self):
+        return self._value
+
+    def __and__(self, other):
+        return _MockGemmFeatures(self._value & int(other))
+
+    def __or__(self, other):
+        return _MockGemmFeatures(self._value | int(other))
+
+    def __bool__(self):
+        return self._value != 0
+
+
+_MockGemmFeatures.NONE = _MockGemmFeatures(0)
+_MockGemmFeatures.MFMA = _MockGemmFeatures(1 << 0)
+_MockGemmFeatures.WMMA = _MockGemmFeatures(1 << 1)
+_MockGemmFeatures.DOT = _MockGemmFeatures(1 << 2)
+_MockGemmFeatures.ATOMIC_ADD = _MockGemmFeatures(1 << 3)
+_MockGemmFeatures.ATOMIC_ADD_BF16 = _MockGemmFeatures(1 << 4)
+_MockGemmFeatures.ATOMIC_ADD_F16 = _MockGemmFeatures(1 << 5)
+_MockGemmFeatures.ATOMIC_FMAX_F32 = _MockGemmFeatures(1 << 6)
+_MockGemmFeatures.DIRECT_TO_LDS_32B = _MockGemmFeatures(1 << 7)
+_MockGemmFeatures.DIRECT_TO_LDS_128B = _MockGemmFeatures(1 << 8)
+
+
+class _MockAmdArchInfo:
+
+    def __init__(self, **kwargs):
+        self.default_features = kwargs.get("default_features", _MockGemmFeatures(0))
+        self.wave_size = kwargs.get("wave_size", 64)
+        self.max_waves_per_eu = kwargs.get("max_waves_per_eu", 10)
+        self.total_sgpr_per_eu = kwargs.get("total_sgpr_per_eu", 512)
+        self.total_vgpr_per_eu = kwargs.get("total_vgpr_per_eu", 256)
+        self.total_shared_mem_per_cu = kwargs.get("total_shared_mem_per_cu", 65536)
+        self.max_shared_mem_per_wg = kwargs.get("max_shared_mem_per_wg", 65536)
+        self.num_eu_per_cu = kwargs.get("num_eu_per_cu", 4)
+        self.min_num_cu = kwargs.get("min_num_cu", 64)
+        self.has_fp8_conversion_instrs = kwargs.get("has_fp8_conversion_instrs", False)
+        self.has_ocp_fp8_conversion_instrs = kwargs.get("has_ocp_fp8_conversion_instrs", False)
+        self.has_fp4 = kwargs.get("has_fp4", False)
+        self.has_scaled_gemm = kwargs.get("has_scaled_gemm", False)
+        self.max_num_xcc = kwargs.get("max_num_xcc", 1)
+        self.has_lds_transpose_load = kwargs.get("has_lds_transpose_load", False)
+
+
+_DEFAULT_MOCK_INFO = _MockAmdArchInfo()
+
+
+def _mock_lookup_arch_info(arch):
+    return _DEFAULT_MOCK_INFO
+
+
+def _mock_has_feature(features, flag) -> bool:
+    return bool(int(features) & int(flag))
+
+
+if "amd_arch_db" not in sys.modules:
+    amd_arch_db_mod = types.ModuleType("amd_arch_db")
+    amd_arch_db_mod.GemmFeatures = _MockGemmFeatures
+    amd_arch_db_mod.lookup_arch_info = _mock_lookup_arch_info
+    amd_arch_db_mod.has_feature = _mock_has_feature
+    sys.modules["amd_arch_db"] = amd_arch_db_mod
diff --git a/mlir/utils/performance/tests/test_perfRunner.py b/mlir/utils/performance/tests/test_perfRunner.py
index 447c6e4076fe..6bb303b75974 100644
--- a/mlir/utils/performance/tests/test_perfRunner.py
+++ b/mlir/utils/performance/tests/test_perfRunner.py
@@ -8,6 +8,7 @@
 import os
 import sys
 import tempfile
+import types
 from pathlib import Path
 
 # Ensure we can import from parent (perfRunner lives in mlir/utils/performance)
@@ -122,20 +123,49 @@ def test_read_nonexistent_returns_none(self):
 
 
 class TestGetNumChiplets:
-    """Tests for get_num_chiplets (pure logic, no GPU)."""
+    """Tests for get_num_chiplets (delegates to amd_arch_db)."""
 
-    def test_gfx942_304(self):
-        assert perfRunner.get_num_chiplets("gfx942", 304) == 8
+    def test_default_is_one(self):
+        assert perfRunner.get_num_chiplets() == 1
 
-    def test_gfx942_80(self):
-        assert perfRunner.get_num_chiplets("gfx942", 80) == 4
+    def test_forwards_max_num_xcc(self, monkeypatch):
+        monkeypatch.setattr(perfRunner, "lookup_arch_info",
+                            lambda arch: types.SimpleNamespace(max_num_xcc=8))
+        assert perfRunner.get_num_chiplets() == 8
 
-    def test_gfx950(self):
-        assert perfRunner.get_num_chiplets("gfx950", 228) == 8
+    def test_passes_device_id(self, monkeypatch):
+        captured = {}
 
-    def test_default_one(self):
-        assert perfRunner.get_num_chiplets("gfx900", 64) == 1
-        assert perfRunner.get_num_chiplets("gfx1030", 72) == 1
+        def fake_lookup(arch):
+            captured["arch"] = arch
+            return types.SimpleNamespace(max_num_xcc=4)
+
+        monkeypatch.setattr(perfRunner, "lookup_arch_info", fake_lookup)
+        assert perfRunner.get_num_chiplets(2) == 4
+        assert captured["arch"] == "native:2"
+
+
+class TestGetNumCu:
+    """Tests for get_num_cu (delegates to amd_arch_db)."""
+
+    def test_default_is_mock_value(self):
+        assert perfRunner.get_num_cu() == 64
+
+    def test_forwards_min_num_cu(self, monkeypatch):
+        monkeypatch.setattr(perfRunner, "lookup_arch_info",
+                            lambda arch: types.SimpleNamespace(min_num_cu=304))
+        assert perfRunner.get_num_cu() == 304
+
+    def test_passes_device_id(self, monkeypatch):
+        captured = {}
+
+        def fake_lookup(arch):
+            captured["arch"] = arch
+            return types.SimpleNamespace(min_num_cu=80)
+
+        monkeypatch.setattr(perfRunner, "lookup_arch_info", fake_lookup)
+        assert perfRunner.get_num_cu(1) == 80
+        assert captured["arch"] == "native:1"
 
 
 class TestParseDataTypes:
diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py
index 78699bb0ef92..a7ad60ce8d1d 100755
--- a/mlir/utils/performance/tuningRunner.py
+++ b/mlir/utils/performance/tuningRunner.py
@@ -1957,8 +1957,8 @@ def main(args=None):
 
     arch = perfRunner.get_arch()
     chip = perfRunner.get_chip()
-    num_cu = perfRunner.get_num_cu(chip)
-    num_chiplets = perfRunner.get_num_chiplets(chip, num_cu)
+    num_cu = perfRunner.get_num_cu()
+    num_chiplets = perfRunner.get_num_chiplets()
 
     # Handle stdin for configs file
     stdin_temp_file = None