diff --git a/mlir/include/mlir/Dialect/Rock/IR/AmdArchDb.h b/mlir/include/mlir/Dialect/Rock/IR/AmdArchDb.h index 8835dd96235e..e192d553a39c 100644 --- a/mlir/include/mlir/Dialect/Rock/IR/AmdArchDb.h +++ b/mlir/include/mlir/Dialect/Rock/IR/AmdArchDb.h @@ -33,6 +33,7 @@ struct AmdArchInfo { int64_t minNumCU; bool hasFp8ConversionInstrs; bool hasOcpFp8ConversionInstrs; + bool hasFp4; bool hasScaledGemm; int64_t maxNumXCC; bool hasLdsTransposeLoad; @@ -42,14 +43,15 @@ struct AmdArchInfo { int64_t totalVGPRPerEU, int64_t sharedMemPerCU, int64_t sharedMemPerWG, int64_t numEUPerCU, int64_t minNumCU, bool hasFp8ConversionInstrs, - bool hasOcpFp8ConversionInstrs, bool hasScaledGemm, - int64_t maxNumXCC, bool hasLdsTransposeLoad) + bool hasOcpFp8ConversionInstrs, bool hasFp4, + bool hasScaledGemm, int64_t maxNumXCC, + bool hasLdsTransposeLoad) : defaultFeatures(defaultFeatures), waveSize(waveSize), maxWavesPerEU(maxWavesPerEU), totalSGPRPerEU(totalSGPRPerEU), totalVGPRPerEU(totalVGPRPerEU), totalSharedMemPerCU(sharedMemPerCU), maxSharedMemPerWG(sharedMemPerWG), numEUPerCU(numEUPerCU), minNumCU(minNumCU), hasFp8ConversionInstrs(hasFp8ConversionInstrs), - hasOcpFp8ConversionInstrs(hasOcpFp8ConversionInstrs), + hasOcpFp8ConversionInstrs(hasOcpFp8ConversionInstrs), hasFp4(hasFp4), hasScaledGemm(hasScaledGemm), maxNumXCC(maxNumXCC), hasLdsTransposeLoad(hasLdsTransposeLoad) {} diff --git a/mlir/lib/Dialect/Rock/IR/AmdArchDb.cpp b/mlir/lib/Dialect/Rock/IR/AmdArchDb.cpp index e222cba2831c..71c4b568215e 100644 --- a/mlir/lib/Dialect/Rock/IR/AmdArchDb.cpp +++ b/mlir/lib/Dialect/Rock/IR/AmdArchDb.cpp @@ -45,14 +45,16 @@ static constexpr AmdArchInfo /*totalVGPRPerEU*/ 256, /*totalSharedMemPerCU*/ 65536, /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/80, /*hasFp8ConversionInstrs=*/false, - /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false, + /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false, + /*hasScaledGemm=*/false, /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false), cdna50Info(GemmFeatures::dot, /*waveSize=*/64, /*maxWavesPerEU*/ 8, /*totalSGPRPerEU*/ 512, /*totalVGPRPerEU*/ 256, /*totalSharedMemPerCU*/ 65536, /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/10, /*hasFp8ConversionInstrs=*/false, - /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false, + /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false, + /*hasScaledGemm=*/false, /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false), cdnaInfo(GemmFeatures::mfma | GemmFeatures::dot | GemmFeatures::atomic_add | GemmFeatures::atomic_add_f16, @@ -60,7 +62,8 @@ static constexpr AmdArchInfo /*totalVGPRPerEU*/ 256, /*totalSharedMemPerCU*/ 65536, /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/120, /*hasFp8ConversionInstrs=*/false, - /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false, + /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false, + /*hasScaledGemm=*/false, /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false), cdna2Info(GemmFeatures::mfma | GemmFeatures::dot | GemmFeatures::atomic_add | GemmFeatures::atomic_add_f16, @@ -68,7 +71,8 @@ static constexpr AmdArchInfo /*totalVGPRPerEU*/ 512, /*totalSharedMemPerCU*/ 65536, /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/104, /*hasFp8ConversionInstrs=*/false, - /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false, + /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false, + /*hasScaledGemm=*/false, /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false), cdna3Info(GemmFeatures::mfma | GemmFeatures::dot | GemmFeatures::atomic_add | GemmFeatures::atomic_add_f16 | @@ -77,7 +81,8 @@ static constexpr AmdArchInfo /*totalVGPRPerEU*/ 512, /*totalSharedMemPerCU*/ 65536, /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/20, /*hasFp8ConversionInstrs=*/true, - /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false, + /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false, + /*hasScaledGemm=*/false, /*maxNumXCC=*/8, /*hasLdsTransposeLoad=*/false), cdna40Info(GemmFeatures::mfma | GemmFeatures::dot | GemmFeatures::atomic_add | GemmFeatures::atomic_add_f16 | @@ -88,7 +93,8 @@ static constexpr AmdArchInfo /*totalVGPRPerEU*/ 512, /*totalSharedMemPerCU*/ 163840, /*maxSharedMemPerWG*/ 163840, /*numEUPerCU=*/4, /*minNumCU=*/256, /*hasFp8ConversionInstrs=*/false, - /*hasOcpFp8ConversionInstrs=*/true, /*hasScaledGemm=*/true, + /*hasOcpFp8ConversionInstrs=*/true, /*hasFp4=*/true, + /*hasScaledGemm=*/true, /*maxNumXCC=*/8, /*hasLdsTransposeLoad=*/true), // amdgpu target builds all RDNA in WGP Mode rdnaNoDotInfo(GemmFeatures::atomic_fmax_f32, /*waveSize=*/32, @@ -97,14 +103,16 @@ static constexpr AmdArchInfo /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/30, /*hasFp8ConversionInstrs=*/false, - /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false, + /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false, + /*hasScaledGemm=*/false, /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false), rdnaInfo(GemmFeatures::dot | GemmFeatures::atomic_fmax_f32, /*waveSize=*/32, /*maxWavesPerEU*/ 16, /*totalSGPRPerEU*/ 512, /*totalVGPRPerEU*/ 1024, /*totalSharedMemPerCU*/ 131072, /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/2, /*hasFp8ConversionInstrs=*/false, - /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false, + /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false, + /*hasScaledGemm=*/false, /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false), rdna3Info(GemmFeatures::dot | GemmFeatures::atomic_add | GemmFeatures::atomic_fmax_f32 | GemmFeatures::wmma, @@ -112,7 +120,8 @@ static constexpr AmdArchInfo /*totalVGPRPerEU*/ 1536, /*totalSharedMemPerCU*/ 131072, /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/2, /*hasFp8ConversionInstrs=*/false, - /*hasOcpFp8ConversionInstrs=*/false, /*hasScaledGemm=*/false, + /*hasOcpFp8ConversionInstrs=*/false, /*hasFp4=*/false, + /*hasScaledGemm=*/false, /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false), rdna4Info(GemmFeatures::dot | GemmFeatures::atomic_add | GemmFeatures::atomic_fmax_f32 | GemmFeatures::wmma | @@ -121,7 +130,8 @@ static constexpr AmdArchInfo /*totalVGPRPerEU*/ 1536, /*totalSharedMemPerCU*/ 131072, /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/12, /*hasFp8ConversionInstrs=*/false, - /*hasOcpFp8ConversionInstrs=*/true, /*hasScaledGemm=*/false, + /*hasOcpFp8ConversionInstrs=*/true, /*hasFp4=*/false, + /*hasScaledGemm=*/false, /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false), // TODO: update with right information gfx1250Info(GemmFeatures::dot | GemmFeatures::atomic_add | @@ -132,7 +142,8 @@ static constexpr AmdArchInfo /*totalVGPRPerEU*/ 1536, /*totalSharedMemPerCU*/ 131072, /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/12, /*hasFp8ConversionInstrs=*/false, - /*hasOcpFp8ConversionInstrs=*/true, /*hasScaledGemm=*/false, + /*hasOcpFp8ConversionInstrs=*/true, /*hasFp4=*/false, + /*hasScaledGemm=*/false, /*maxNumXCC=*/1, /*hasLdsTransposeLoad=*/false); static std::tuple parseArchString(StringRef arch) { @@ -360,8 +371,6 @@ AmdArchInfo nativeArchInfo(unsigned deviceId = 0) { #endif // !_WIN32 && ROCMLIR_ENABLE_NATIVE_ARCH AmdArchInfo mlir::rock::lookupArchInfo(StringRef arch) { - // Keep this implementation in sync with - // mlir/test/lit.site.cfg.py.in:set_arch_features() auto [chip, deviceId] = parseArchString(arch); if (chip == "native") { #if !defined(_WIN32) && defined(ROCMLIR_ENABLE_NATIVE_ARCH) diff --git a/mlir/lib/Dialect/Rock/utility/Bindings/AmdArchDbBindings.cpp b/mlir/lib/Dialect/Rock/utility/Bindings/AmdArchDbBindings.cpp index ada137be7a3a..e9909ae2894e 100644 --- a/mlir/lib/Dialect/Rock/utility/Bindings/AmdArchDbBindings.cpp +++ b/mlir/lib/Dialect/Rock/utility/Bindings/AmdArchDbBindings.cpp @@ -10,13 +10,14 @@ #include #include "mlir/Dialect/Rock/IR/AmdArchDb.h" +#include "llvm/ADT/StringRef.h" namespace py = pybind11; PYBIND11_MODULE(amd_arch_db, m) { m.doc() = "Database of AMD GPU features"; - py::enum_(m, "GemmFeatures") + py::enum_(m, "GemmFeatures", py::arithmetic()) .value("NONE", mlir::rock::GemmFeatures::none) .value("MFMA", mlir::rock::GemmFeatures::mfma) .value("WMMA", mlir::rock::GemmFeatures::wmma) @@ -24,7 +25,10 @@ PYBIND11_MODULE(amd_arch_db, m) { .value("ATOMIC_ADD", mlir::rock::GemmFeatures::atomic_add) .value("ATOMIC_ADD_BF16", mlir::rock::GemmFeatures::atomic_add_bf16) .value("ATOMIC_ADD_F16", mlir::rock::GemmFeatures::atomic_add_f16) - .value("ATOMIC_FMAX_F32", mlir::rock::GemmFeatures::atomic_fmax_f32); + .value("ATOMIC_FMAX_F32", mlir::rock::GemmFeatures::atomic_fmax_f32) + .value("DIRECT_TO_LDS_32B", mlir::rock::GemmFeatures::direct_to_lds_32b) + .value("DIRECT_TO_LDS_128B", + mlir::rock::GemmFeatures::direct_to_lds_128b); py::class_(m, "AmdArchInfo") .def_readonly("default_features", @@ -45,12 +49,31 @@ PYBIND11_MODULE(amd_arch_db, m) { &mlir::rock::AmdArchInfo::hasFp8ConversionInstrs) .def_readonly("has_ocp_fp8_conversion_instrs", &mlir::rock::AmdArchInfo::hasOcpFp8ConversionInstrs) + .def_readonly("has_fp4", &mlir::rock::AmdArchInfo::hasFp4) .def_readonly("has_scaled_gemm", &mlir::rock::AmdArchInfo::hasScaledGemm) .def_readonly("max_num_xcc", &mlir::rock::AmdArchInfo::maxNumXCC) .def_readonly("has_lds_transpose_load", &mlir::rock::AmdArchInfo::hasLdsTransposeLoad); + m.def( + "has_feature", + [](mlir::rock::GemmFeatures features, mlir::rock::GemmFeatures flag) { + return bitEnumContainsAny(features, flag); + }, + "Return True if any bit set in `flag` is also set in `features`. " + "Matches `bool(int(features) & int(flag))`."); + m.def("lookup_arch_info", [](const std::string &arch) { + // The "native:" code path in lookupArchInfo requires the build to + // have been configured with ROCMLIR_ENABLE_NATIVE_ARCH=ON. Without it the + // underlying call hits an llvm_unreachable, which would abort the Python + // interpreter; raise a Python-level error instead. +#ifndef ROCMLIR_ENABLE_NATIVE_ARCH + if (llvm::StringRef(arch).starts_with("native")) + throw py::value_error( + "\"native\" arch lookup is not available in this build " + "(requires ROCMLIR_ENABLE_NATIVE_ARCH=ON)"); +#endif return mlir::rock::lookupArchInfo(arch); }); } diff --git a/mlir/lib/Dialect/Rock/utility/Bindings/CMakeLists.txt b/mlir/lib/Dialect/Rock/utility/Bindings/CMakeLists.txt index efc13d07ff5e..cf8be7ae29c3 100644 --- a/mlir/lib/Dialect/Rock/utility/Bindings/CMakeLists.txt +++ b/mlir/lib/Dialect/Rock/utility/Bindings/CMakeLists.txt @@ -27,4 +27,12 @@ if(NOT WIN32) pybind11_add_module(amd_arch_db AmdArchDbBindings.cpp) target_link_libraries(amd_arch_db PUBLIC MLIRRockUtility) + if(ROCMLIR_ENABLE_NATIVE_ARCH) + target_compile_definitions(amd_arch_db PRIVATE ROCMLIR_ENABLE_NATIVE_ARCH=1) + endif() + set_target_properties(amd_arch_db PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${ROCMLIR_BIN_DIR}") + message(VERBOSE "amd_arch_db Python binding will be built in ${ROCMLIR_BIN_DIR}") + message(VERBOSE "To use it outside the build bin directory, set: " + "export PYTHONPATH=${ROCMLIR_BIN_DIR}:\$PYTHONPATH") endif() diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index c2852d8720a3..a11fbf077298 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -110,6 +110,10 @@ list(APPEND ROCMLIR_TEST_DEPENDS rocmlir-common-python-test-utils ) +if (TARGET amd_arch_db) + list(APPEND ROCMLIR_TEST_DEPENDS amd_arch_db) +endif() + if(MLIR_ENABLE_ROCM_RUNNER) list(APPEND ROCMLIR_TEST_DEPENDS mlir_runner_utils diff --git a/mlir/test/common_utils/common.py b/mlir/test/common_utils/common.py index 60c70de500f1..8833291327cd 100644 --- a/mlir/test/common_utils/common.py +++ b/mlir/test/common_utils/common.py @@ -1,50 +1,31 @@ from hip import hip +from amd_arch_db import GemmFeatures, has_feature, lookup_arch_info -# Helper function to decode arch to its features -# Keep this in sync with mlir/lib/Dialect/Rock/Generator/AmdArchDb.cpp:mlir::rock::lookupArchInfo -def get_arch_features(arch: str): - chip_name = arch.split(':')[0] - if len(chip_name) < 5: - return +def features_to_string(features): + val = int(features) + if val == 0: + return 'none' + # Iteration follows the .value(...) chain in + # mlir/lib/Dialect/Rock/utility/Bindings/AmdArchDbBindings.cpp, which is + # kept in sync with the bit positions in RockAttrDefs.td. Do not reorder + # without updating the bindings; lit tests match on this exact spelling. + names = [] + for name, member in GemmFeatures.__members__.items(): + bit = int(member) + if bit and (val & bit): + names.append(name.lower()) + return '|'.join(names) + - arch_features = None - support_mfma = False - support_wmma = False - support_accel_fp8 = False - major = chip_name[:-2] - minor = chip_name[-2:] - if major == 'gfx9': - if minor in ['08', '0a']: - arch_features = 'mfma|dot|atomic_add|atomic_add_f16' - elif minor == '42': - arch_features = 'mfma|dot|atomic_add|atomic_add_f16|direct_to_lds_32b' - support_accel_fp8 = True - elif minor == '50': - arch_features = 'mfma|dot|atomic_add|atomic_add_f16|atomic_add_bf16|direct_to_lds_32b|direct_to_lds_128b|lds_transpose_load' - support_accel_fp8 = True - elif minor == '06': - arch_features = 'dot' - else: - arch_features = 'none' - elif major == 'gfx10': - if minor in ['11', '13']: - arch_features = 'atomic_fmax_f32' - elif minor in ['10', '12'] or minor[0] == '3': - arch_features = 'dot|atomic_fmax_f32' - else: - arch_features = 'atomic_fmax_f32' - elif major == 'gfx11': - arch_features = 'dot|atomic_add|atomic_fmax_f32|wmma' - elif major == 'gfx12': - arch_features = 'dot|atomic_add|atomic_add_f16|atomic_add_bf16|atomic_fmax_f32|wmma' - support_accel_fp8 = True - if arch_features and 'mfma' in arch_features: - support_mfma = True - pass - elif arch_features and 'wmma' in arch_features: - support_wmma = True - pass +def get_arch_features(arch: str): + info = lookup_arch_info(arch) + arch_features = features_to_string(info.default_features) + if info.has_lds_transpose_load: + arch_features += '|lds_transpose_load' + support_mfma = has_feature(info.default_features, GemmFeatures.MFMA) + support_wmma = has_feature(info.default_features, GemmFeatures.WMMA) + support_accel_fp8 = info.has_fp8_conversion_instrs or info.has_ocp_fp8_conversion_instrs return arch_features, support_mfma, support_wmma, support_accel_fp8 @@ -82,4 +63,6 @@ def get_default_agent(): def is_xdlops_present() -> bool: """This function checks whether a GPU with xdlops support is present""" - return any([agent.startswith("gfx9") for agent in get_agents()]) + return any( + has_feature(lookup_arch_info(agent).default_features, GemmFeatures.MFMA) + for agent in get_agents()) diff --git a/mlir/test/e2e/lit.site.cfg.py.in b/mlir/test/e2e/lit.site.cfg.py.in index c0ebd0c6f128..0e1aead518e0 100644 --- a/mlir/test/e2e/lit.site.cfg.py.in +++ b/mlir/test/e2e/lit.site.cfg.py.in @@ -28,6 +28,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@" # Add common python test utils sys.path.append(config.rocmlir_common_python_tests_utils) +sys.path.append(config.mlir_rock_tools_dir) from common import get_agents, get_arch_features, get_default_agent # Support substitution of the tools_dir with user parameters. This is diff --git a/mlir/test/fusion/e2e/lit.site.cfg.py.in b/mlir/test/fusion/e2e/lit.site.cfg.py.in index d0a38d7f3c81..278cb041b42b 100644 --- a/mlir/test/fusion/e2e/lit.site.cfg.py.in +++ b/mlir/test/fusion/e2e/lit.site.cfg.py.in @@ -35,7 +35,8 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@" # Add common python test utils sys.path.append(config.rocmlir_common_python_tests_utils) -from common import get_agents, get_default_agent +sys.path.append(config.mlir_rock_tools_dir) +from common import get_agents, get_arch_features, get_default_agent # Support substitution of the tools_dir with user parameters. This is # used when we can't determine the tool dir at configuration time. @@ -78,11 +79,7 @@ if config.rocm_path: "HIP_VISIBLE_DEVICES will be set to '0' to ensure binary compatibility." % (', '.join(sorted(agents)), default_agent)) config.arch = default_agent - # Check features for the device we'll actually use - if any([arch in default_agent for arch in ["gfx908", "gfx90a", "gfx942", "gfx950"]]): - config.arch_support_mfma = True - elif "gfx11" in default_agent or "gfx12" in default_agent: - config.arch_support_wmma = True + _, config.arch_support_mfma, config.arch_support_wmma, _ = get_arch_features(default_agent) if not config.arch: config.no_AMD_GPU = True except subprocess.CalledProcessError: diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in index fa8e08a52393..fb6443ea2853 100644 --- a/mlir/test/lit.site.cfg.py.in +++ b/mlir/test/lit.site.cfg.py.in @@ -59,6 +59,7 @@ config.rocmlir_common_python_tests_utils = "@ROCMLIR_COMMON_PYTHON_TESTS_UTILS@" # Add common python test utils sys.path.append(config.rocmlir_common_python_tests_utils) +sys.path.append(config.mlir_rock_tools_dir) from common import get_agents, get_arch_features, get_default_agent # Support substitution of the tools_dir with user parameters. This is diff --git a/mlir/unittests/Dialect/Rock/AmdArchDbTests.cpp b/mlir/unittests/Dialect/Rock/AmdArchDbTests.cpp index c30ac8f152b4..f0f04b0c32ed 100644 --- a/mlir/unittests/Dialect/Rock/AmdArchDbTests.cpp +++ b/mlir/unittests/Dialect/Rock/AmdArchDbTests.cpp @@ -57,7 +57,10 @@ TEST_P(NativeArchTest, NativeArchInfoMatchesPresetInfo) { nativeInfo.hasFp8ConversionInstrs); EXPECT_EQ(presetInfo.hasOcpFp8ConversionInstrs, nativeInfo.hasOcpFp8ConversionInstrs); + EXPECT_EQ(presetInfo.hasFp4, nativeInfo.hasFp4); + EXPECT_EQ(presetInfo.hasScaledGemm, nativeInfo.hasScaledGemm); EXPECT_GE(presetInfo.maxNumXCC, nativeInfo.maxNumXCC); + EXPECT_EQ(presetInfo.hasLdsTransposeLoad, nativeInfo.hasLdsTransposeLoad); } INSTANTIATE_TEST_SUITE_P(NativeArchTests, NativeArchTest, diff --git a/mlir/utils/performance/CMakeLists.txt b/mlir/utils/performance/CMakeLists.txt index 2b19d483b0b2..c28085f74d58 100644 --- a/mlir/utils/performance/CMakeLists.txt +++ b/mlir/utils/performance/CMakeLists.txt @@ -25,3 +25,7 @@ list(TRANSFORM PERFORMANCE_SCRIPTS PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/") add_custom_target(ci-performance-scripts COMMAND ${CMAKE_COMMAND} -E copy ${PERFORMANCE_SCRIPTS} ${ROCMLIR_BIN_DIR}) + +if (TARGET amd_arch_db) + add_dependencies(ci-performance-scripts amd_arch_db) +endif() diff --git a/mlir/utils/performance/analysis/quickTuningGen.py b/mlir/utils/performance/analysis/quickTuningGen.py index ed0b745363a5..dc87e8df39a4 100644 --- a/mlir/utils/performance/analysis/quickTuningGen.py +++ b/mlir/utils/performance/analysis/quickTuningGen.py @@ -14,6 +14,8 @@ import pandas as pd import pulp +from amd_arch_db import GemmFeatures, has_feature, lookup_arch_info + # Column definitions for grouping problems GEMM_COLUMNS = ['TransA', 'TransB', 'G', 'M', 'K', 'N'] CONV_COLUMNS = [ @@ -37,9 +39,10 @@ def get_instruction_type(arch, dtype, op): """Determine instruction type based on architecture, data type, and operation.""" if op == "attention": return "GemmGemm" - if arch.startswith("gfx9"): + features = lookup_arch_info(arch).default_features + if has_feature(features, GemmFeatures.MFMA): return "XDL" - elif arch.startswith("gfx1") and dtype != "f32": + if has_feature(features, GemmFeatures.WMMA) and dtype != "f32": return "Wmma" return "NonAccel" diff --git a/mlir/utils/performance/analysis/testing-metrics.py b/mlir/utils/performance/analysis/testing-metrics.py index d4ee7780bb88..03587a7d016d 100644 --- a/mlir/utils/performance/analysis/testing-metrics.py +++ b/mlir/utils/performance/analysis/testing-metrics.py @@ -17,33 +17,21 @@ import argparse import math import os -from hip import hip -# TODO use AmdArchDb.py (when it's implemented) +from amd_arch_db import lookup_arch_info -num_eu_per_cu = 4 # may be changed in newer architectures - -def hip_check(call_result): - err = call_result[0] - result = call_result[1:] - if len(result) == 1: - result = result[0] - if isinstance(err, hip.hipError_t) and err != hip.hipError_t.hipSuccess: - raise RuntimeError(str(err)) - return result +def get_num_eu_per_cu(): + return lookup_arch_info("native:0").num_eu_per_cu def assign_num_cu(): if args.c: return int(args.c) - else: - props = hip.hipDeviceProp_t() - hip_check(hip.hipGetDeviceProperties(props, 0)) - print( - "Using info from GPU 0 in your system, the data should have be obtained from the same GPU." - ) - return int(props.multiProcessorCount) + print( + "Using info from GPU 0 in your system, the data should have been obtained from the same GPU." + ) + return lookup_arch_info("native:0").min_num_cu def analyze_gemm_file(file, n): @@ -244,7 +232,7 @@ def determine_file_type(file): args = parser.parse_args() num_cus = assign_num_cu() - min_num_waves = num_cus * num_eu_per_cu + min_num_waves = num_cus * get_num_eu_per_cu() row_list = [] diff --git a/mlir/utils/performance/attentionSweeps.py b/mlir/utils/performance/attentionSweeps.py index 914c0054c08c..7f31223c8ce8 100755 --- a/mlir/utils/performance/attentionSweeps.py +++ b/mlir/utils/performance/attentionSweeps.py @@ -35,9 +35,9 @@ Options, sweep_parameters, multiline_repr, - infer_codegen_flags_from_arch, get_codegen_flags_for_codepath, ) +from amd_arch_db import GemmFeatures, has_feature, lookup_arch_info # GLOBAL VARIABLES DATA_TYPES_ATTENTION = initialize_dtypes_attn() @@ -206,13 +206,13 @@ def _infer_instruction_set(arch: str, requested: str) -> str: if requested in ('mfma', 'wmma'): return requested - codepath, _ = infer_codegen_flags_from_arch(arch) - if codepath == 'unknown': - raise RuntimeError(f"Unknown arch for attention sweep: {arch}") - if codepath == 'vanilla': - raise RuntimeError(f"Unsupported attention codepath '{codepath}' for arch {arch}. " - "Attention sweep requires MFMA or WMMA.") - return codepath + features = lookup_arch_info(arch).default_features + if has_feature(features, GemmFeatures.MFMA): + return 'mfma' + if has_feature(features, GemmFeatures.WMMA): + return 'wmma' + raise RuntimeError(f"Unsupported arch for attention sweep: {arch}. " + "Attention sweep requires MFMA or WMMA.") def _resolve_codegen_flags(arch: str, instruction_set: str) -> list[str]: @@ -272,7 +272,6 @@ def log_failing_configs(configs: List[AttentionConfiguration], filename: str): def run_attention_sweep(args, options, paths, chip): - # TODO: use AmdArchDb python version when available try: instruction_set = _infer_instruction_set(options.arch, args.codepath) except RuntimeError as e: @@ -354,7 +353,7 @@ def main(): if chip_match is None: raise RuntimeError(f"Could not find GFX chip in arch string: {arch}") chip = chip_match.group(0) - num_cu = get_num_cu(chip) + num_cu = get_num_cu() paths = create_paths(None, args.mlir_build_dir) options = Options(debug_fails=args.debug_fails, debug=args.debug, @@ -363,7 +362,7 @@ def main(): flags=[], concurrent_tests=args.jobs, num_cu=num_cu, - num_chiplets=get_num_chiplets(chip, num_cu), + num_chiplets=get_num_chiplets(), log_failures=args.log_failures, test_timeout_sec=args.test_timeout_sec) diff --git a/mlir/utils/performance/parameterSweeps.py b/mlir/utils/performance/parameterSweeps.py index d9776063cf3f..2f265ccd2c55 100755 --- a/mlir/utils/performance/parameterSweeps.py +++ b/mlir/utils/performance/parameterSweeps.py @@ -27,6 +27,8 @@ from perfRunner import get_num_cu from perfRunner import get_num_chiplets +from amd_arch_db import GemmFeatures, has_feature, lookup_arch_info + @dataclass(frozen=True) class Options: @@ -62,16 +64,16 @@ async def _communicate_with_timeout(proc: asyncio.subprocess.Process, def get_codegen_flags_for_codepath(arch: str, codepath: str) -> list[str]: """Returns rocmlir-gen feature flags for a given codepath and architecture.""" + features = lookup_arch_info(arch).default_features + if codepath == 'mfma': flags = ['-mfma=on', '-dot=on', '-atomic_add=on', '-atomic_add_f16=on'] - if 'gfx942' in arch: + if has_feature(features, GemmFeatures.ATOMIC_ADD_BF16): + flags.append('-atomic_add_bf16=on') + if has_feature(features, GemmFeatures.DIRECT_TO_LDS_32B): flags.append('-direct_to_lds_32b=on') - elif 'gfx95' in arch: - flags.extend([ - '-atomic_add_bf16=on', - '-direct_to_lds_32b=on', - '-direct_to_lds_128b=on', - ]) + if has_feature(features, GemmFeatures.DIRECT_TO_LDS_128B): + flags.append('-direct_to_lds_128b=on') return flags if codepath == 'vanilla': @@ -79,8 +81,10 @@ def get_codegen_flags_for_codepath(arch: str, codepath: str) -> list[str]: if codepath == 'wmma': flags = ['-mfma=off', '-dot=on', '-atomic_add=on', '-wmma=infer'] - if 'gfx12' in arch: - flags.extend(['-atomic_add_f16=on', '-atomic_add_bf16=on']) + if has_feature(features, GemmFeatures.ATOMIC_ADD_F16): + flags.append('-atomic_add_f16=on') + if has_feature(features, GemmFeatures.ATOMIC_ADD_BF16): + flags.append('-atomic_add_bf16=on') return flags return [] @@ -94,30 +98,20 @@ def infer_codegen_flags_from_arch(arch: str, rely on rocmlir-gen arch auto-detection and return no explicit feature flags; flags are only emitted when a codepath override is explicitly requested. - - Returns ('unknown', []) when inference fails. """ supported_codepath = ['mfma', 'vanilla', 'wmma'] codepath = requested_codepath if codepath not in supported_codepath: - if 'gfx908' in arch or 'gfx90a' in arch: - codepath = 'mfma' - elif 'gfx942' in arch: - codepath = 'mfma' - elif 'gfx95' in arch: + features = lookup_arch_info(arch).default_features + if int(features) == 0: + return ('unknown', []) + elif has_feature(features, GemmFeatures.MFMA): codepath = 'mfma' - elif 'gfx906' in arch: - codepath = 'vanilla' - elif 'gfx1030' in arch: - # Use vanilla codepath for gfx1030 until it has its own perf configs. - codepath = 'vanilla' - elif 'gfx11' in arch: - codepath = 'wmma' - elif 'gfx12' in arch: + elif has_feature(features, GemmFeatures.WMMA): codepath = 'wmma' else: - return ('unknown', []) + codepath = 'vanilla' if requested_codepath in supported_codepath: return (codepath, get_codegen_flags_for_codepath(arch, codepath)) @@ -741,8 +735,7 @@ def main() -> bool: # For non-perf-config sweeps, let rocmlir-gen infer features from --arch. rocmlir_gen_flags = [] - chip = perfRunner.get_chip() - num_cu = get_num_cu(chip) + num_cu = get_num_cu() options = Options(debug=args.debug, quiet=args.quiet, log_failures=args.log_failures, @@ -751,7 +744,7 @@ def main() -> bool: flags=rocmlir_gen_flags, concurrent_tests=args.jobs, num_cu=num_cu, - num_chiplets=get_num_chiplets(chip, num_cu), + num_chiplets=get_num_chiplets(), test_timeout_sec=args.test_timeout_sec) paths = perfRunner.create_paths(None, args.mlir_build_dir) diff --git a/mlir/utils/performance/perfRegressionReport.py b/mlir/utils/performance/perfRegressionReport.py index 5a89a0d7b657..7a49b9a16331 100644 --- a/mlir/utils/performance/perfRegressionReport.py +++ b/mlir/utils/performance/perfRegressionReport.py @@ -18,12 +18,13 @@ def load_mlir_data(filename: str): 'LDSBankConflict (MIOpen)', 'LDSBankConflict (hipBLASLt)' ] df.drop(columns=columns_dropped, inplace=True, errors='ignore') - # Work around empty PerfConfig field whin migrating from no tuning to yes tuning + # Work around empty PerfConfig field when migrating from no tuning to yes tuning # Can be removed next time we touch this if 'PerfConfig' in df: df['PerfConfig'] = df['PerfConfig'].fillna('None') if 'numCU' not in df: - df.insert(4, 'numCU', get_num_cu(df['Chip'][0])) + # numCU is sourced from device 0 of the local host + df.insert(4, 'numCU', get_num_cu()) return df diff --git a/mlir/utils/performance/perfRunner.py b/mlir/utils/performance/perfRunner.py index 8f1add09c46e..fe9e5d6e38fd 100644 --- a/mlir/utils/performance/perfRunner.py +++ b/mlir/utils/performance/perfRunner.py @@ -20,7 +20,10 @@ import pandas as pd from hip import hip +from amd_arch_db import GemmFeatures, has_feature, lookup_arch_info + import reportUtils + from perfCommonUtils import Operation, GEMMLibrary # global variables. @@ -64,8 +67,6 @@ ELAPSED_TIME_RE = re.compile(r"Elapsed: ([0-9\.]*) ms") # Compiled regexp object used for extracting target chip from arch GFX_CHIP_RE = re.compile(r"gfx[0-9a-z]+") -INFO_ARCH_NAME = re.compile(r"Name:\s*(.*)") -INFO_ARCH_CU = re.compile(r"Compute Unit:\s*(.*)") def input_layouts(input_layout): @@ -188,12 +189,25 @@ def get_chip(): return chip +def chip_has_fp8(): + info = lookup_arch_info(get_chip()) + return info.has_fp8_conversion_instrs or info.has_ocp_fp8_conversion_instrs + + +def chip_has_fp4(): + return lookup_arch_info(get_chip()).has_fp4 + + +def chip_has_mfma(): + return has_feature(lookup_arch_info(get_chip()).default_features, GemmFeatures.MFMA) + + DATA_TYPES_ATTENTION = None def initialize_dtypes_attn(): global DATA_TYPES_ATTENTION - if get_chip().startswith('gfx9'): + if chip_has_mfma(): DATA_TYPES_ATTENTION = DATA_TYPES_ATTENTION_MFMA else: DATA_TYPES_ATTENTION = DATA_TYPES_ATTENTION_WMMA @@ -480,8 +494,7 @@ def get_conv_configurations(filename, arch, num_cu, num_chiplets): # Skip unsupported datatypes if datatype == 'convfp8': - unsupported_chips = {'gfx908', 'gfx90a', 'gfx1030', 'gfx1101'} - if get_chip() in unsupported_chips: + if not chip_has_fp8(): continue # Skip int8 non-fwd convolutions @@ -790,14 +803,11 @@ def get_gemm_configurations(filename, # Skip unsupported datatypes if datatype == 'f4E2M1FN': - # TODO: use information from AMDArchDB when it becomes available to determine supported chips - supported_chips = {'gfx950'} - if get_chip() not in supported_chips: + if not chip_has_fp4(): continue if datatype == 'fp8': - unsupported_chips = {'gfx908', 'gfx90a', 'gfx1030', 'gfx1101'} - if get_chip() in unsupported_chips: + if not chip_has_fp8(): continue # We need trailing spaces here to account for the concat below @@ -2296,39 +2306,14 @@ def parse_data_types(data_types): return datatypes, out_map -def get_num_chiplets(chip, num_cu): - # TODO: use AmdArchDb python bindings - if "gfx942" in chip and num_cu == 304: - return 8 - if "gfx942" in chip and num_cu == 80: - return 4 - if "gfx950" in chip: - return 8 +def get_num_chiplets(device_id: int = 0): + # In native mode, max_num_xcc contains the actual number of chiplets instead of the maximum + return lookup_arch_info(f"native:{device_id}").max_num_xcc - return 1 - -def get_num_cu(chip): - try: - rocminfo = subprocess.check_output("/opt/rocm/bin/rocminfo", stderr=subprocess.PIPE) - except subprocess.CalledProcessError as e: - print(e.stderr.decode('utf-8')) - raise - except Exception as e: - print(f"Exception: {e}") - raise - rocminfo_lines = rocminfo.decode("utf-8").split("\n") - found_chip = False - for line in rocminfo_lines: - if not found_chip: - m = INFO_ARCH_NAME.search(line) - if m and chip in m.group(1).strip(): - found_chip = True - if found_chip: - compute_unit = INFO_ARCH_CU.search(line) - if compute_unit: - return int(compute_unit.group(1)) - assert False, f"Cannot find number of CUs for {chip}" +def get_num_cu(device_id: int = 0): + # In native mode, min_num_cu contains the actual number of CUs instead of the minimum + return lookup_arch_info(f"native:{device_id}").min_num_cu def found_external_tool(paths: Paths, @@ -2366,8 +2351,8 @@ def main(args=None): arch = get_arch() chip = get_chip() - num_cu = get_num_cu(chip) - num_chiplets = get_num_chiplets(chip, num_cu) + num_cu = get_num_cu() + num_chiplets = get_num_chiplets() initialize_dtypes_attn() root_dir = str( diff --git a/mlir/utils/performance/tests/mock_hip.py b/mlir/utils/performance/tests/mock_hip.py index 8ff303bab7ec..cd637869464a 100644 --- a/mlir/utils/performance/tests/mock_hip.py +++ b/mlir/utils/performance/tests/mock_hip.py @@ -1,4 +1,4 @@ -"""Inject mock 'hip' module so perfRunner can be imported without ROCm (e.g. in CI).""" +"""Inject mock 'hip' and 'amd_arch_db' modules so perfRunner can be imported without ROCm.""" import sys import types @@ -33,3 +33,77 @@ class _MockHip: hip_pkg = types.ModuleType("hip") hip_pkg.hip = _MockHip() sys.modules["hip"] = hip_pkg + +# --- Mock amd_arch_db (compiled C++ extension, unavailable in CI) --- +# Keep bit positions in sync with GemmFeatures in +# mlir/include/mlir/Dialect/Rock/IR/RockAttrDefs.td + + +class _MockGemmFeatures: + """Minimal mock of the GemmFeatures enum with arithmetic support.""" + + def __init__(self, value=0): + self._value = int(value) + + def __int__(self): + return self._value + + def __and__(self, other): + return _MockGemmFeatures(self._value & int(other)) + + def __or__(self, other): + return _MockGemmFeatures(self._value | int(other)) + + def __bool__(self): + return self._value != 0 + + +_MockGemmFeatures.NONE = _MockGemmFeatures(0) +_MockGemmFeatures.MFMA = _MockGemmFeatures(1 << 0) +_MockGemmFeatures.WMMA = _MockGemmFeatures(1 << 1) +_MockGemmFeatures.DOT = _MockGemmFeatures(1 << 2) +_MockGemmFeatures.ATOMIC_ADD = _MockGemmFeatures(1 << 3) +_MockGemmFeatures.ATOMIC_ADD_BF16 = _MockGemmFeatures(1 << 4) +_MockGemmFeatures.ATOMIC_ADD_F16 = _MockGemmFeatures(1 << 5) +_MockGemmFeatures.ATOMIC_FMAX_F32 = _MockGemmFeatures(1 << 6) +_MockGemmFeatures.DIRECT_TO_LDS_32B = _MockGemmFeatures(1 << 7) +_MockGemmFeatures.DIRECT_TO_LDS_128B = _MockGemmFeatures(1 << 8) + + +class _MockAmdArchInfo: + + def __init__(self, **kwargs): + self.default_features = kwargs.get("default_features", _MockGemmFeatures(0)) + self.wave_size = kwargs.get("wave_size", 64) + self.max_waves_per_eu = kwargs.get("max_waves_per_eu", 10) + self.total_sgpr_per_eu = kwargs.get("total_sgpr_per_eu", 512) + self.total_vgpr_per_eu = kwargs.get("total_vgpr_per_eu", 256) + self.total_shared_mem_per_cu = kwargs.get("total_shared_mem_per_cu", 65536) + self.max_shared_mem_per_wg = kwargs.get("max_shared_mem_per_wg", 65536) + self.num_eu_per_cu = kwargs.get("num_eu_per_cu", 4) + self.min_num_cu = kwargs.get("min_num_cu", 64) + self.has_fp8_conversion_instrs = kwargs.get("has_fp8_conversion_instrs", False) + self.has_ocp_fp8_conversion_instrs = kwargs.get("has_ocp_fp8_conversion_instrs", False) + self.has_fp4 = kwargs.get("has_fp4", False) + self.has_scaled_gemm = kwargs.get("has_scaled_gemm", False) + self.max_num_xcc = kwargs.get("max_num_xcc", 1) + self.has_lds_transpose_load = kwargs.get("has_lds_transpose_load", False) + + +_DEFAULT_MOCK_INFO = _MockAmdArchInfo() + + +def _mock_lookup_arch_info(arch): + return _DEFAULT_MOCK_INFO + + +def _mock_has_feature(features, flag) -> bool: + return bool(int(features) & int(flag)) + + +if "amd_arch_db" not in sys.modules: + amd_arch_db_mod = types.ModuleType("amd_arch_db") + amd_arch_db_mod.GemmFeatures = _MockGemmFeatures + amd_arch_db_mod.lookup_arch_info = _mock_lookup_arch_info + amd_arch_db_mod.has_feature = _mock_has_feature + sys.modules["amd_arch_db"] = amd_arch_db_mod diff --git a/mlir/utils/performance/tests/test_perfRunner.py b/mlir/utils/performance/tests/test_perfRunner.py index 447c6e4076fe..6bb303b75974 100644 --- a/mlir/utils/performance/tests/test_perfRunner.py +++ b/mlir/utils/performance/tests/test_perfRunner.py @@ -8,6 +8,7 @@ import os import sys import tempfile +import types from pathlib import Path # Ensure we can import from parent (perfRunner lives in mlir/utils/performance) @@ -122,20 +123,49 @@ def test_read_nonexistent_returns_none(self): class TestGetNumChiplets: - """Tests for get_num_chiplets (pure logic, no GPU).""" + """Tests for get_num_chiplets (delegates to amd_arch_db).""" - def test_gfx942_304(self): - assert perfRunner.get_num_chiplets("gfx942", 304) == 8 + def test_default_is_one(self): + assert perfRunner.get_num_chiplets() == 1 - def test_gfx942_80(self): - assert perfRunner.get_num_chiplets("gfx942", 80) == 4 + def test_forwards_max_num_xcc(self, monkeypatch): + monkeypatch.setattr(perfRunner, "lookup_arch_info", + lambda arch: types.SimpleNamespace(max_num_xcc=8)) + assert perfRunner.get_num_chiplets() == 8 - def test_gfx950(self): - assert perfRunner.get_num_chiplets("gfx950", 228) == 8 + def test_passes_device_id(self, monkeypatch): + captured = {} - def test_default_one(self): - assert perfRunner.get_num_chiplets("gfx900", 64) == 1 - assert perfRunner.get_num_chiplets("gfx1030", 72) == 1 + def fake_lookup(arch): + captured["arch"] = arch + return types.SimpleNamespace(max_num_xcc=4) + + monkeypatch.setattr(perfRunner, "lookup_arch_info", fake_lookup) + assert perfRunner.get_num_chiplets(2) == 4 + assert captured["arch"] == "native:2" + + +class TestGetNumCu: + """Tests for get_num_cu (delegates to amd_arch_db).""" + + def test_default_is_mock_value(self): + assert perfRunner.get_num_cu() == 64 + + def test_forwards_min_num_cu(self, monkeypatch): + monkeypatch.setattr(perfRunner, "lookup_arch_info", + lambda arch: types.SimpleNamespace(min_num_cu=304)) + assert perfRunner.get_num_cu() == 304 + + def test_passes_device_id(self, monkeypatch): + captured = {} + + def fake_lookup(arch): + captured["arch"] = arch + return types.SimpleNamespace(min_num_cu=80) + + monkeypatch.setattr(perfRunner, "lookup_arch_info", fake_lookup) + assert perfRunner.get_num_cu(1) == 80 + assert captured["arch"] == "native:1" class TestParseDataTypes: diff --git a/mlir/utils/performance/tuningRunner.py b/mlir/utils/performance/tuningRunner.py index 78699bb0ef92..a7ad60ce8d1d 100755 --- a/mlir/utils/performance/tuningRunner.py +++ b/mlir/utils/performance/tuningRunner.py @@ -1957,8 +1957,8 @@ def main(args=None): arch = perfRunner.get_arch() chip = perfRunner.get_chip() - num_cu = perfRunner.get_num_cu(chip) - num_chiplets = perfRunner.get_num_chiplets(chip, num_cu) + num_cu = perfRunner.get_num_cu() + num_chiplets = perfRunner.get_num_chiplets() # Handle stdin for configs file stdin_temp_file = None