Skip to content

Commit 7dcc606

Browse files
[rocm-libraries] ROCm/rocm-libraries#5383 (commit b660b8c)
[CK_TILE] Add CShuffleLds microbenchmark suite ## Summary Microbenchmarks isolating LDS store/load operations in CShuffleEpilogue for bank conflict analysis. ## Motivation CShuffleEpilogue performs LDS store (MFMA registers → LDS) and load (LDS → registers for coalesced global writes). This suite isolates each operation to: - Identify which operation causes bank conflicts - Measure pure LDS bandwidth per access pattern - Validate access patterns across MFMA tile sizes and wave layouts ## Components - **Microkernels** (`tile_load_store_microkernels.hpp`): `StoreTile<Setup>`, `LoadTile<Setup>` - **Setup Adapters** (`benchmark_cshuffle_lds.hpp`): Wire CShuffleEpilogue to microkernels - **Template** (`benchmark_template.cpp.in`): Generated benchmarks with timing ## Build ```bash cmake -G Ninja -B build -S . \ -DGPU_TARGETS=gfx950 \ -DBUILD_CK_EXAMPLES=ON \ -DBUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS=ON ninja -C build bench_lds_fp8_16x16x128_2x2_fp8 ``` ## New CMake Options | Option | Default | Description | |--------|---------|-------------| | `BUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS` | OFF | LDS microbenchmarks | | `BUILD_CK_TILE_FMHA_TESTS` | ON | FMHA tests | | `BUILD_CK_TILE_ENGINE` | ON | Tile engine | | `BUILD_CK_TILE_ENGINE_TESTS` | ON | Tile engine tests | | `BUILD_CK_EXAMPLES` | ON | Examples | | `BUILD_CK_TUTORIALS` | ON | Tutorials | | `BUILD_CK_DEVICE_INSTANCES` | ON | Device instances | | `BUILD_CK_PROFILER` | ON | Profiler | Setting guards to OFF reduces cmake configure from ~150s to ~5s.
1 parent 5348b57 commit 7dcc606

11 files changed

Lines changed: 631 additions & 76 deletions

File tree

CMakeLists.txt

Lines changed: 87 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ option(CK_EXPERIMENTAL_BUILDER "Enable experimental builder" OFF)
5252
option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)
5353
option(FORCE_DISABLE_XDL "Skip compiling XDL specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF)
5454
option(FORCE_DISABLE_WMMA "Skip compiling WMMA specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF)
55+
option(BUILD_CK_TILE_ENGINE "Build the tile_engine subdirectory" ON)
56+
option(BUILD_CK_EXAMPLES "Build the example subdirectory" ON)
57+
option(BUILD_CK_TUTORIALS "Build the tutorial subdirectory" ON)
5558

5659
if(CK_EXPERIMENTAL_BUILDER)
5760
add_definitions(-DCK_EXPERIMENTAL_BUILDER)
@@ -668,94 +671,106 @@ if(NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTENSOR_REQ_LIBS_ONLY)
668671
endif()
669672

670673

671-
672-
# Optimization: Search only in library/src where all instance files actually live
673-
# (was searching entire source tree, taking ~40s instead of <1s)
674-
file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/library/src/*/device_*_instance.cpp")
675-
file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
676-
set(CK_DEVICE_INSTANCES)
677-
FOREACH(subdir_path ${dir_list})
678-
set(target_dir)
679-
IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
680-
set(cmake_instance)
681-
file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
682-
set(add_inst 0)
683-
if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8")
684-
set(add_inst 1)
685-
endif()
686-
if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8")
687-
set(add_inst 1)
688-
endif()
689-
if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
690-
set(add_inst 1)
691-
endif()
692-
if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32")
693-
set(add_inst 1)
694-
endif()
695-
if(("${cmake_instance}" MATCHES "tf32" OR "${cmake_instance}" MATCHES "_tf32") AND DTYPES MATCHES "tf32")
696-
set(add_inst 1)
697-
endif()
698-
if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64")
699-
set(add_inst 1)
700-
endif()
701-
if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16")
702-
set(add_inst 1)
703-
endif()
704-
if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
705-
set(add_inst 1)
706-
endif()
707-
if(NOT "${cmake_instance}" MATCHES "DTYPES")
708-
set(add_inst 1)
709-
endif()
710-
if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
711-
list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
712-
endif()
713-
ENDIF()
714-
ENDFOREACH()
715-
716-
add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
717-
718674
option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF)
719675
option(HIPTENSOR_REQ_LIBS_ONLY "Build only the HipTensor required libraries" OFF)
720676
option(DISABLE_OFFLOAD_COMPRESS "Disable offload compress compiler flag when building instances" OFF)
721677
option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)
722-
723-
add_subdirectory(library)
678+
# Component switches. Each option gates one optional part of the build so that
# unneeded components can be turned off to shorten configuration.
option(BUILD_CK_DEVICE_INSTANCES "Build device operation instances in library/" ON)
option(BUILD_CK_PROFILER "Build the CK profiler in profiler/" ON)
option(BUILD_CK_TILE_ENGINE_TESTS "Build tile engine tests" ON)
option(BUILD_CK_TILE_FMHA_TESTS "Build FMHA tests" ON)
option(BUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS "Build CShuffleLds microbenchmarks (requires BUILD_CK_EXAMPLES=ON)" OFF)

# The benchmark option documents a dependency on BUILD_CK_EXAMPLES. Report a
# contradictory configuration at configure time instead of leaving the user to
# discover later that the benchmark targets are missing.
if(BUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS AND NOT BUILD_CK_EXAMPLES)
    message(WARNING
        "BUILD_CK_TILE_CSHUFFLE_LDS_BENCHMARKS=ON requires BUILD_CK_EXAMPLES=ON; "
        "the CShuffleLds microbenchmarks may not be configured.")
endif()
683+
684+
# Device instances: collect the per-operation instance targets, expose them
# through the aggregate `instances` target, and add library/.
if(BUILD_CK_DEVICE_INSTANCES)
    set(ck_instance_gpu_dir "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu")

    # Optimization: search only in library/src where all instance files actually live
    # (searching the entire source tree took ~40s instead of <1s).
    file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/library/src/*/device_*_instance.cpp")
    file(GLOB dir_list RELATIVE "${ck_instance_gpu_dir}" "${ck_instance_gpu_dir}/*")

    set(CK_DEVICE_INSTANCES)
    foreach(subdir_path ${dir_list})
        set(ck_instance_cmake "${ck_instance_gpu_dir}/${subdir_path}/CMakeLists.txt")
        # Only directories that carry their own CMakeLists.txt are candidates;
        # reading a missing file would abort configuration.
        if(IS_DIRECTORY "${ck_instance_gpu_dir}/${subdir_path}" AND EXISTS "${ck_instance_cmake}")
            file(READ "${ck_instance_cmake}" cmake_instance)
            set(add_inst 0)

            # Each entry is "<dtype name>;<short suffix>". A subdirectory is
            # selected when its CMakeLists.txt text mentions either spelling
            # and DTYPES matches the dtype name.
            foreach(ck_dtype_tokens
                    "fp8;_f8" "bf8;_b8" "fp16;_f16" "fp32;_f32"
                    "tf32;_tf32" "fp64;_f64" "bf16;_b16" "int8;_i8")
                list(GET ck_dtype_tokens 0 ck_dtype_name)
                list(GET ck_dtype_tokens 1 ck_dtype_suffix)
                if(("${cmake_instance}" MATCHES "${ck_dtype_name}" OR "${cmake_instance}" MATCHES "${ck_dtype_suffix}")
                   AND DTYPES MATCHES "${ck_dtype_name}")
                    set(add_inst 1)
                endif()
            endforeach()

            # A CMakeLists.txt that never mentions DTYPES is always selected.
            if(NOT "${cmake_instance}" MATCHES "DTYPES")
                set(add_inst 1)
            endif()

            # Without a DTYPES restriction every instance directory is selected.
            if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
                list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
            endif()
        endif()
    endforeach()

    add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
    add_subdirectory(library)
endif()
724732

725733
if (CK_EXPERIMENTAL_BUILDER)
726734
add_subdirectory(experimental/builder)
727735
add_subdirectory(experimental/grouped_convolution_tile_instances)
728736
endif()
729737

730738
if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTENSOR_REQ_LIBS_ONLY)
731-
rocm_package_setup_component(tests
732-
LIBRARY_NAME composablekernel
733-
PACKAGE_NAME tests # Prevent -static suffix on package name
734-
)
735-
736-
rocm_package_setup_component(examples
737-
LIBRARY_NAME composablekernel
738-
PACKAGE_NAME examples
739-
)
740-
add_subdirectory(example)
741-
742-
add_subdirectory(tutorial)
743-
rocm_package_setup_component(tutorials
744-
LIBRARY_NAME composablekernel
745-
PACKAGE_NAME tutorials
746-
)
747-
add_subdirectory(tile_engine)
739+
# Examples: register the "examples" package component, then add example/.
if(BUILD_CK_EXAMPLES)
    rocm_package_setup_component(examples LIBRARY_NAME composablekernel PACKAGE_NAME examples)
    add_subdirectory(example)
endif()

# Tutorials: add tutorial/, then register the "tutorials" package component.
if(BUILD_CK_TUTORIALS)
    add_subdirectory(tutorial)
    rocm_package_setup_component(tutorials LIBRARY_NAME composablekernel PACKAGE_NAME tutorials)
endif()

# Tile engine.
if(BUILD_CK_TILE_ENGINE)
    add_subdirectory(tile_engine)
endif()

# Tests: the "tests" package component is only registered when tests are built.
if(BUILD_TESTING)
    # An explicit PACKAGE_NAME prevents the -static suffix on the package name.
    rocm_package_setup_component(tests LIBRARY_NAME composablekernel PACKAGE_NAME tests)
    add_subdirectory(test)
endif()
751764
endif()
752765

753-
if (NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTENSOR_REQ_LIBS_ONLY)
754-
rocm_package_setup_component(profiler
755-
LIBRARY_NAME composablekernel
756-
PACKAGE_NAME ckprofiler
757-
)
758-
add_subdirectory(profiler)
766+
# Profiler: built only when requested and when the build is not restricted to
# the MIOpen / hipTensor required libraries.
# NOTE(review): profiler/ presumably links against targets created by library/;
# confirm that BUILD_CK_PROFILER=ON with BUILD_CK_DEVICE_INSTANCES=OFF is a
# supported combination.
if(BUILD_CK_PROFILER AND NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTENSOR_REQ_LIBS_ONLY)
    rocm_package_setup_component(profiler LIBRARY_NAME composablekernel PACKAGE_NAME ckprofiler)
    add_subdirectory(profiler)
endif()
760775

761776
if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))

CMakePresets.json

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,22 @@
5151
"GPU_TARGETS": "gfx908;gfx90a;gfx942"
5252
}
5353
},
54+
{
55+
"name": "dev-minimal",
56+
"binaryDir": "${sourceDir}/build",
57+
"displayName": "CK Dev - Minimal Build",
58+
"description": "Fast iteration build with minimal components (configure ~5s vs ~150s)",
59+
"inherits": ["dev"],
60+
"cacheVariables": {
61+
"BUILD_CK_DEVICE_INSTANCES": "OFF",
62+
"BUILD_CK_PROFILER": "OFF",
63+
"BUILD_CK_EXAMPLES": "OFF",
64+
"BUILD_CK_TUTORIALS": "OFF",
65+
"BUILD_CK_TILE_ENGINE": "OFF",
66+
"BUILD_CK_TILE_ENGINE_TESTS": "OFF",
67+
"BUILD_CK_TILE_FMHA_TESTS": "OFF"
68+
}
69+
},
5470
{
5571
"name": "dev-gfx908",
5672
"displayName": "CK Dev - gfx908",

README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,21 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
124124
../script/cmake-ck-dev.sh .. gfx90a -DCMAKE_BUILD_TYPE=Release
125125
```
126126
127+
**Fast iteration builds:**
128+
129+
For faster CMake configuration during development (~5s vs ~150s), use the `--minimal` flag to disable
130+
building device instances, profiler, examples, tutorials, and tests:
131+
132+
```bash
133+
../script/cmake-ck-dev.sh --minimal .. gfx90a
134+
```
135+
136+
You can also specify a custom preset:
137+
138+
```bash
139+
../script/cmake-ck-dev.sh --preset=dev-minimal .. gfx90a
140+
```
141+
127142
5. Build the entire CK library:
128143
129144
```bash
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2+
# SPDX-License-Identifier: MIT
3+
4+
# CShuffleLds LDS store/load microbenchmark suite
5+
# Measures LDS bandwidth and bank conflicts for different MFMA configurations
6+
7+
# Directory in the build tree that receives the generated benchmark sources.
set(GENERATED_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/generated")
file(MAKE_DIRECTORY "${GENERATED_SOURCE_DIR}")

# add_cshuffle_lds_benchmark(<NAME> <A_TYPE> <B_TYPE> <ACC_TYPE> <O_TYPE>
#                            <M> <N> <M_WAVE> <N_WAVE> <M_XDL> <N_XDL> <K_XDL>
#                            <CONFIG_NAME>)
#
# Core function: generates <NAME>.cpp from benchmark_template.cpp.in and builds
# it as a HIP executable target called <NAME>.
#
# All parameters are in scope when configure_file() runs, so the template can
# reference them as @VAR@ placeholders (@ONLY mode leaves ${...} untouched).
# Which placeholders the template actually uses is not visible from this file.
function(add_cshuffle_lds_benchmark NAME A_TYPE B_TYPE ACC_TYPE O_TYPE M N M_WAVE N_WAVE M_XDL N_XDL K_XDL CONFIG_NAME)
    set(GENERATED_SOURCE "${GENERATED_SOURCE_DIR}/${NAME}.cpp")
    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/benchmark_template.cpp.in" "${GENERATED_SOURCE}" @ONLY)

    # The generated file has a .cpp extension, so the HIP language must be set explicitly.
    set_source_files_properties("${GENERATED_SOURCE}" PROPERTIES LANGUAGE HIP)

    add_executable(${NAME} "${GENERATED_SOURCE}")
    set_property(TARGET ${NAME} PROPERTY HIP_ARCHITECTURES ${SUPPORTED_GPU_TARGETS})
    target_include_directories(${NAME} PRIVATE
        "${PROJECT_SOURCE_DIR}/include"
        "${PROJECT_SOURCE_DIR}/test"
        "${CMAKE_CURRENT_SOURCE_DIR}")
    target_link_libraries(${NAME} PRIVATE hip::device)

    # Preprocessor macros belong in compile definitions, not raw -D compile options.
    if(CK_USE_OCP_FP8)
        target_compile_definitions(${NAME} PRIVATE CK_TILE_USE_OCP_FP8)
    endif()
endfunction()
23+
24+
# Type-specific wrappers (derive name and config from parameters)
25+
# add_fp16_benchmark(<M> <N> <M_WAVE> <N_WAVE> <M_XDL> <N_XDL> <K_XDL>)
# ck_tile::half_t for A, B and the output, float accumulator. The target name
# and config label are derived from the MFMA tile and the wave layout.
function(add_fp16_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
    set(layout_tag "${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
    set(NAME "bench_lds_fp16_${layout_tag}")
    set(CONFIG "FP16_${layout_tag}")
    add_cshuffle_lds_benchmark(
        ${NAME}
        "ck_tile::half_t" "ck_tile::half_t" "float" "ck_tile::half_t"
        ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL}
        ${CONFIG})
endfunction()
31+
32+
# add_fp8_fp16_benchmark(<M> <N> <M_WAVE> <N_WAVE> <M_XDL> <N_XDL> <K_XDL>)
# ck_tile::fp8_t for A and B, float accumulator, ck_tile::half_t output. The
# target name and config label carry an "_fp16" suffix for the output type.
function(add_fp8_fp16_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
    set(layout_tag "${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
    set(NAME "bench_lds_fp8_${layout_tag}_fp16")
    set(CONFIG "FP8_${layout_tag}_fp16")
    add_cshuffle_lds_benchmark(
        ${NAME}
        "ck_tile::fp8_t" "ck_tile::fp8_t" "float" "ck_tile::half_t"
        ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL}
        ${CONFIG})
endfunction()
38+
39+
# add_fp8_fp8_benchmark(<M> <N> <M_WAVE> <N_WAVE> <M_XDL> <N_XDL> <K_XDL>)
# ck_tile::fp8_t for A, B and the output, float accumulator. The target name
# and config label carry an "_fp8" suffix for the output type.
function(add_fp8_fp8_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
    set(layout_tag "${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
    set(NAME "bench_lds_fp8_${layout_tag}_fp8")
    set(CONFIG "FP8_${layout_tag}_fp8")
    add_cshuffle_lds_benchmark(
        ${NAME}
        "ck_tile::fp8_t" "ck_tile::fp8_t" "float" "ck_tile::fp8_t"
        ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL}
        ${CONFIG})
endfunction()
45+
46+
# add_fp32_benchmark(<M> <N> <M_WAVE> <N_WAVE> <M_XDL> <N_XDL> <K_XDL>)
# float for A, B, the accumulator and the output. The target name and config
# label are derived from the MFMA tile and the wave layout.
function(add_fp32_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
    set(layout_tag "${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
    set(NAME "bench_lds_fp32_${layout_tag}")
    set(CONFIG "FP32_${layout_tag}")
    add_cshuffle_lds_benchmark(
        ${NAME}
        "float" "float" "float" "float"
        ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL}
        ${CONFIG})
endfunction()
52+
53+
# add_bf16_benchmark(<M> <N> <M_WAVE> <N_WAVE> <M_XDL> <N_XDL> <K_XDL>)
# ck_tile::bf16_t for A, B and the output, float accumulator. The target name
# and config label are derived from the MFMA tile and the wave layout.
function(add_bf16_benchmark M N M_WAVE N_WAVE M_XDL N_XDL K_XDL)
    set(layout_tag "${M_XDL}x${N_XDL}x${K_XDL}_${M_WAVE}x${N_WAVE}")
    set(NAME "bench_lds_bf16_${layout_tag}")
    set(CONFIG "BF16_${layout_tag}")
    add_cshuffle_lds_benchmark(
        ${NAME}
        "ck_tile::bf16_t" "ck_tile::bf16_t" "float" "ck_tile::bf16_t"
        ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL}
        ${CONFIG})
endfunction()
59+
60+
# add_benchmarks_for_mfma(<FUNC> <M_XDL> <N_XDL> <K_XDL>)
#
# Helper that adds benchmarks for all wave layouts of a given MFMA tile.
#   FUNC   - name of the type-specific wrapper command to call
#            (e.g. add_fp16_benchmark)
#   M_XDL, N_XDL, K_XDL - MFMA tile dimensions
#
# FUNC is called once per wave layout (M_WAVE x N_WAVE = 4x1, 2x2, 1x4). The
# block tile is sized to exactly one MFMA tile per wave:
#   M = M_XDL * M_WAVE, N = N_XDL * N_WAVE.
#
# Implemented as a function rather than a macro so that the loop temporaries
# (WAVE_LAYOUT, M_WAVE, N_WAVE, M, N) do not leak into the calling scope.
function(add_benchmarks_for_mfma FUNC M_XDL N_XDL K_XDL)
    # Fail early with a clear message instead of an obscure cmake_language() error.
    if(NOT COMMAND "${FUNC}")
        message(FATAL_ERROR "add_benchmarks_for_mfma: unknown benchmark command '${FUNC}'")
    endif()

    # Each quoted item is one "M_WAVE;N_WAVE" pair; quoting keeps the pair together.
    foreach(WAVE_LAYOUT "4;1" "2;2" "1;4")
        list(GET WAVE_LAYOUT 0 M_WAVE)
        list(GET WAVE_LAYOUT 1 N_WAVE)
        math(EXPR M "${M_XDL} * ${M_WAVE}")
        math(EXPR N "${N_XDL} * ${N_WAVE}")
        cmake_language(CALL ${FUNC} ${M} ${N} ${M_WAVE} ${N_WAVE} ${M_XDL} ${N_XDL} ${K_XDL})
    endforeach()
endfunction()
71+
72+
# Each quoted item below is one "M_XDL;N_XDL;K_XDL" MFMA tile; the unquoted
# expansion splits it into the three tile arguments of add_benchmarks_for_mfma.

#
# FP32 benchmarks
#
# MFMA tiles: 32x32x4, 32x32x8, 16x16x4, 16x16x8, 16x16x16
foreach(mfma_tile "32;32;4" "32;32;8" "16;16;4" "16;16;8" "16;16;16")
    add_benchmarks_for_mfma(add_fp32_benchmark ${mfma_tile})
endforeach()

#
# FP16 benchmarks
#
# MFMA tiles: 32x32x8, 32x32x16, 16x16x16, 4x64x16, 64x4x16
foreach(mfma_tile "32;32;8" "32;32;16" "16;16;16" "4;64;16" "64;4;16")
    add_benchmarks_for_mfma(add_fp16_benchmark ${mfma_tile})
endforeach()

#
# FP8 -> FP16 benchmarks
#
# MFMA tiles: 32x32x16, 16x16x32
foreach(mfma_tile "32;32;16" "16;16;32")
    add_benchmarks_for_mfma(add_fp8_fp16_benchmark ${mfma_tile})
endforeach()

#
# FP8 -> FP8 benchmarks
#
# MFMA tiles: 32x32x16, 16x16x32
foreach(mfma_tile "32;32;16" "16;16;32")
    add_benchmarks_for_mfma(add_fp8_fp8_benchmark ${mfma_tile})
endforeach()
105+
106+
#
# gfx950-only configurations
#
if(SUPPORTED_GPU_TARGETS MATCHES "gfx950")
    # FP16: 16x16x32
    add_benchmarks_for_mfma(add_fp16_benchmark 16 16 32)

    # BF16: 16x16x64 (gfx950-only, uses 16x16x32 base instruction)
    # Other BF16 tiles have same LDS behavior as FP16 since both are 2-byte types
    add_benchmarks_for_mfma(add_bf16_benchmark 16 16 64)

    # FP8 -> FP16, then FP8 -> FP8: 32x32x32, 32x32x64, 16x16x64, 16x16x128
    # Each quoted item is one "M_XDL;N_XDL;K_XDL" tile.
    foreach(fp8_wrapper add_fp8_fp16_benchmark add_fp8_fp8_benchmark)
        foreach(mfma_tile "32;32;32" "32;32;64" "16;16;64" "16;16;128")
            add_benchmarks_for_mfma(${fp8_wrapper} ${mfma_tile})
        endforeach()
    endforeach()
endif()

0 commit comments

Comments
 (0)