Skip to content

Commit ad41f17

Browse files
Merge branch 'develop' into users/spolifroni-amd/ck-remove-api-reference
2 parents b173eb1 + a07c8e3 commit ad41f17

188 files changed

Lines changed: 6382 additions & 2241 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
88
* Added preshuffleB support for abquant mode in blockscale GEMM.
99
* Added support for explicit GEMM in CK_TILE grouped convolution forward and backward weight.
1010
* Added TF32 convolution support on gfx942 and gfx950 in CK. It could be enabled/disabled via `DTYPES` of "tf32".
11-
* Added attention sink support for FMHA FWD, include qr_ks_vs, qr_async and splitkv pipelines.
11+
* Added streamingllm sink support for FMHA FWD, include qr_ks_vs, qr_async and splitkv pipelines.
1212
* Added support for microscaling (MX) FP8/FP4 mixed data types to Flatmm pipeline.
1313
* Added support for fp8 dynamic tensor-wise quantization of fp8 fmha fwd kernel.
1414
* Added FP8 KV cache support for FMHA batch prefill.
1515
* Added support for gfx1153 target.
1616
* Added FMHA batch prefill kernel support for several KV cache layouts, flexible page sizes, and different lookup table configurations.
17+
* Added gpt-oss sink support for FMHA FWD, include qr_ks_vs, qr_async, qr_async_trload and splitkv pipelines.
1718

1819
### Changed
1920

CMakeLists.txt

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,12 @@ endif()
3131
# Default installation path
3232
if(NOT WIN32)
3333
set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
34+
else()
35+
set(CMAKE_INSTALL_PREFIX "C:/dist/TheRock" CACHE PATH "")
3436
endif()
3537

3638
set(version 1.2.0)
37-
# Check support for CUDA/HIP in Cmake
38-
project(composable_kernel VERSION ${version} LANGUAGES CXX HIP)
39+
project(composable_kernel VERSION ${version} LANGUAGES CXX)
3940
include(CTest)
4041

4142
option(ENABLE_CLANG_CPP_CHECKS "Enables clang tidy, cppcheck" ON)
@@ -162,7 +163,13 @@ execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD OUTPUT_VARIABLE COMMI
162163
configure_file(include/ck/version.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/version.h)
163164

164165
set(ROCM_SYMLINK_LIBS OFF)
165-
find_package(ROCM REQUIRED PATHS /opt/rocm)
166+
167+
if (WIN32)
168+
find_package(ROCmCMakeBuildTools REQUIRED PATHS C:/dist/TheRock)
169+
set(HIP_PLATFORM "amd" CACHE STRING "HIP platform")
170+
else()
171+
find_package(ROCM REQUIRED PATHS /opt/rocm)
172+
endif()
166173

167174
include(ROCMInstallTargets)
168175
include(ROCMPackageConfigHelpers)
@@ -189,7 +196,10 @@ if(GPU_TARGETS)
189196
else()
190197
set(USER_GPU_TARGETS 0)
191198
endif()
199+
192200
find_package(hip REQUIRED)
201+
enable_language(HIP)
202+
193203
# No assumption that HIP kernels are launched with uniform block size for backward compatibility
194204
# SWDEV-413293 and https://reviews.llvm.org/D155213
195205
math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")

Jenkinsfile

Lines changed: 17 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,8 @@ def cmake_build(Map conf=[:]){
574574
def setup_cmd
575575
def build_cmd
576576
def execute_cmd = conf.get("execute_cmd", "")
577+
//check the node gpu architecture
578+
def arch_name = check_arch_name()
577579
if(!setup_args.contains("NO_CK_BUILD")){
578580
if (params.NINJA_BUILD_TRACE) {
579581
echo "running ninja build trace"
@@ -646,15 +648,15 @@ def cmake_build(Map conf=[:]){
646648

647649
//run tests except when NO_CK_BUILD or BUILD_LEGACY_OS are set
648650
if(!setup_args.contains("NO_CK_BUILD") && !params.BUILD_LEGACY_OS){
649-
sh "python3 ../script/ninja_json_converter.py .ninja_log --legacy-format --output ck_build_trace_${check_arch_name()}.json"
650-
archiveArtifacts "ck_build_trace_${check_arch_name()}.json"
651-
sh "python3 ../script/parse_ninja_trace.py ck_build_trace_${check_arch_name()}.json"
651+
sh "python3 ../script/ninja_json_converter.py .ninja_log --legacy-format --output ck_build_trace_${arch_name}.json"
652+
archiveArtifacts "ck_build_trace_${arch_name}.json"
653+
sh "python3 ../script/parse_ninja_trace.py ck_build_trace_${arch_name}.json"
652654
if (params.NINJA_BUILD_TRACE || params.BUILD_INSTANCES_ONLY){
653655
if (params.NINJA_FTIME_TRACE) {
654656
echo "running ClangBuildAnalyzer"
655657
sh "/ClangBuildAnalyzer/build/ClangBuildAnalyzer --all . clang_build.log"
656-
sh "/ClangBuildAnalyzer/build/ClangBuildAnalyzer --analyze clang_build.log > clang_build_analysis_${check_arch_name()}.log"
657-
archiveArtifacts "clang_build_analysis_${check_arch_name()}.log"
658+
sh "/ClangBuildAnalyzer/build/ClangBuildAnalyzer --analyze clang_build.log > clang_build_analysis_${arch_name}.log"
659+
archiveArtifacts "clang_build_analysis_${arch_name}.log"
658660
}
659661

660662

@@ -672,8 +674,8 @@ def cmake_build(Map conf=[:]){
672674
if(params.BUILD_PACKAGES){
673675
echo "Build ckProfiler packages"
674676
sh 'ninja -j64 package'
675-
sh "mv composablekernel-ckprofiler_*.deb composablekernel-ckprofiler_1.2.0_amd64_${check_arch_name()}.deb"
676-
stash includes: "composablekernel-ckprofiler**.deb", name: "profiler_package_${check_arch_name()}"
677+
sh "mv composablekernel-ckprofiler_*.deb composablekernel-ckprofiler_1.2.0_amd64_${arch_name}.deb"
678+
stash includes: "composablekernel-ckprofiler**.deb", name: "profiler_package_${arch_name}"
677679
}
678680
}
679681
if(params.BUILD_INSTANCES_ONLY){
@@ -699,16 +701,14 @@ def cmake_build(Map conf=[:]){
699701
if(params.BUILD_PACKAGES){
700702
echo "Build ckProfiler packages"
701703
sh 'ninja -j64 package'
702-
sh "mv composablekernel-ckprofiler_*.deb composablekernel-ckprofiler_1.2.0_amd64_${check_arch_name()}.deb"
703-
stash includes: "composablekernel-ckprofiler**.deb", name: "profiler_package_${check_arch_name()}"
704+
sh "mv composablekernel-ckprofiler_*.deb composablekernel-ckprofiler_1.2.0_amd64_${arch_name}.deb"
705+
stash includes: "composablekernel-ckprofiler**.deb", name: "profiler_package_${arch_name}"
704706
}
705707
}
706708
}
707709
}
708710
}
709711

710-
//check the node gpu architecture
711-
def arch_name = check_arch_name()
712712
if (params.RUN_CK_TILE_FMHA_TESTS){
713713
try{
714714
archiveArtifacts "perf_fmha_*.log"
@@ -1201,8 +1201,8 @@ pipeline {
12011201
description: "Run the ck_tile FMHA tests (default: OFF)")
12021202
booleanParam(
12031203
name: "RUN_TILE_ENGINE_BASIC_TESTS",
1204-
defaultValue: false,
1205-
description: "Run the tile_engine_basic tests (default: OFF)")
1204+
defaultValue: true,
1205+
description: "Run the tile_engine_basic tests (default: ON)")
12061206
booleanParam(
12071207
name: "RUN_TILE_ENGINE_GEMM_TESTS",
12081208
defaultValue: false,
@@ -1650,7 +1650,10 @@ pipeline {
16501650
-D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
16511651
-D GEMM_PRESHUFFLE_LAYOUT="rcr" \
16521652
-D GEMM_PRESHUFFLE_CONFIG_FILE="default_ci_config.json" .. && \
1653-
ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all """
1653+
ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \
1654+
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
1655+
python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
1656+
python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
16541657
}
16551658
steps{
16561659
buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
@@ -1667,37 +1670,6 @@ pipeline {
16671670
}
16681671
parallel
16691672
{
1670-
stage("Run TILE_ENGINE_GEMM Tests on gfx90a")
1671-
{
1672-
when {
1673-
beforeAgent true
1674-
expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() }
1675-
}
1676-
agent{ label rocmnode("gfx90a") }
1677-
environment{
1678-
setup_args = "NO_CK_BUILD"
1679-
execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
1680-
-D CMAKE_CXX_COMPILER="${params.BUILD_COMPILER}" \
1681-
-D CMAKE_BUILD_TYPE=Release \
1682-
-D GPU_TARGETS="gfx90a" \
1683-
-D GEMM_UNIVERSAL_DATATYPE="fp8;fp16" \
1684-
-D GEMM_UNIVERSAL_LAYOUT="rcr;rrr;crr;ccr" \
1685-
-D GEMM_STREAMK_DATATYPE="fp8;fp16" \
1686-
-D GEMM_STREAMK_LAYOUT="rcr" \
1687-
-D GEMM_MULTI_D_DATATYPE="fp16" \
1688-
-D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
1689-
-D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
1690-
-D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \
1691-
ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \
1692-
python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
1693-
python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
1694-
python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
1695-
}
1696-
steps{
1697-
buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
1698-
cleanWs()
1699-
}
1700-
}
17011673
stage("Run TILE_ENGINE_GEMM Tests on gfx942")
17021674
{
17031675
when {

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,22 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa
137137
```
138138
**[See Note on -j](#notes)**
139139
140+
### Building for Windows
141+
142+
Install TheRock and run CMake configure as
143+
144+
```bash
145+
cmake \
146+
-D CMAKE_PREFIX_PATH="C:/dist/TheRock" \
147+
-D CMAKE_CXX_COMPILER="C:/dist/TheRock/bin/hipcc.exe" \
148+
-D CMAKE_BUILD_TYPE=Release \
149+
-D GPU_TARGETS="gfx1151" \
150+
-G Ninja \
151+
..
152+
```
153+
154+
Use Ninja to build either the whole library or individual targets.
155+
140156
## Optional post-install steps
141157
142158
* Build examples and tests:

example/CMakeLists.txt

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,35 @@ include_directories(BEFORE
66
${PROJECT_SOURCE_DIR}/library/include
77
)
88

9+
if(WIN32)
10+
# On Windows, HIP uses -nostdlib which prevents C runtime linking
11+
# We need legacy_stdio_definitions.lib to provide vfprintf and other legacy C functions
12+
# This is mainly needed for the getopt library.
13+
set(LEGACY_STDIO_SEARCH_PATHS)
14+
15+
# Try to use Visual C++ Tools environment variable (if build executes from Visual Studio Developer Command Prompt)
16+
if(DEFINED ENV{VCToolsInstallDir})
17+
list(APPEND LEGACY_STDIO_SEARCH_PATHS "$ENV{VCToolsInstallDir}/lib/x64")
18+
endif()
19+
20+
# Fallback: Search common Visual Studio installation locations
21+
file(GLOB MSVC_LIB_PATHS "C:/Program Files/Microsoft Visual Studio/*/*/VC/Tools/MSVC/*/lib/x64")
22+
list(APPEND LEGACY_STDIO_SEARCH_PATHS ${MSVC_LIB_PATHS})
23+
24+
# Use find_library to locate the library
25+
find_library(LEGACY_STDIO_LIB legacy_stdio_definitions
26+
PATHS ${LEGACY_STDIO_SEARCH_PATHS}
27+
NO_DEFAULT_PATH
28+
)
29+
30+
if(LEGACY_STDIO_LIB)
31+
message(STATUS "Found legacy_stdio_definitions.lib: ${LEGACY_STDIO_LIB}")
32+
add_link_options("SHELL:-Xlinker \"${LEGACY_STDIO_LIB}\"")
33+
else()
34+
message(WARNING "Could not find legacy_stdio_definitions.lib - examples may fail to link.")
35+
endif()
36+
endif()
37+
938
add_custom_target(examples)
1039

1140

@@ -216,6 +245,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
216245
set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
217246
add_executable(${EXAMPLE_NAME} ${FILE_NAME})
218247
target_link_libraries(${EXAMPLE_NAME} PRIVATE utility)
248+
target_link_libraries(${EXAMPLE_NAME} PRIVATE getopt::getopt)
219249
add_dependencies(examples ${EXAMPLE_NAME})
220250
set_property(TARGET ${EXAMPLE_NAME} PROPERTY HIP_ARCHITECTURES ${EX_TARGETS})
221251
rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples)

example/ck_tile/01_fmha/example_fmha_fwd.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,8 @@ auto create_args(int argc, char* argv[])
114114
.insert("kv_eff_lens",
115115
"",
116116
"Batch-mode only: per-batch effective seqlen for KV (exclude PAD).\n"
117-
"Comma-separated list of length 'b'. If empty, no override.");
117+
"Comma-separated list of length 'b'. If empty, no override.")
118+
.insert("init_sink", "0", "value to init the output tensor sink value for validation");
118119

119120
bool result = arg_parser.parse(argc, argv);
120121
return std::make_tuple(result, arg_parser);
@@ -157,6 +158,7 @@ auto run(const ck_tile::ArgParser& arg_parser)
157158
ck_tile::index_t num_splits = arg_parser.get_int("num_splits");
158159
std::string init_method = arg_parser.get_str("init");
159160
uint32_t seed = arg_parser.get_uint32("seed");
161+
int init_sink_value = arg_parser.get_int("init_sink");
160162

161163
ck_tile::stream_config stream_config{nullptr,
162164
true,
@@ -203,6 +205,7 @@ auto run(const ck_tile::ArgParser& arg_parser)
203205
init_method,
204206
seed,
205207
do_validation,
208+
init_sink_value,
206209
stream_config,
207210
json);
208211
}

0 commit comments

Comments
 (0)