From d43a866f3b3b0b657b0b4bfb8a8dc282d985bc90 Mon Sep 17 00:00:00 2001 From: intwanghao Date: Tue, 22 Apr 2025 11:30:01 +0800 Subject: [PATCH 1/4] fix Signed-off-by: intwanghao --- .../lib/DPCT/RulesLangLib/CUBAPIMigration.cpp | 3 +- .../dpct/detail/group_utils_detail.hpp | 17 +++++- .../dpct-rt/include/dpct/group_utils.hpp | 14 +++-- .../dpct/cub/blocklevel/blockradixsort.cu | 60 ++++++++++++++++++- 4 files changed, 84 insertions(+), 10 deletions(-) diff --git a/clang/lib/DPCT/RulesLangLib/CUBAPIMigration.cpp b/clang/lib/DPCT/RulesLangLib/CUBAPIMigration.cpp index 45ef28576dc4..aa3417c0b0df 100644 --- a/clang/lib/DPCT/RulesLangLib/CUBAPIMigration.cpp +++ b/clang/lib/DPCT/RulesLangLib/CUBAPIMigration.cpp @@ -285,11 +285,10 @@ void CubMemberCallRule::runRule( CanTy->getAs()->getDecl()); const auto &ValueTyArg = ClassSpecDecl->getTemplateArgs()[0]; - ValueTyArg.getAsType().getAsString(); std::string Fn; llvm::raw_string_ostream OS(Fn); OS << MapNames::getDpctNamespace() << "group::" << HelpFuncName << "<" - << ValueTyArg.getAsType().getAsString(); + << DpctGlobalInfo::getReplacedTypeName(ValueTyArg.getAsType()); if (isBlockShuffle) { if (!ClassSpecDecl->getTemplateArgs()[1].getIsDefaulted()) { OS << ", " << ClassSpecDecl->getTemplateArgs()[1].getAsIntegral(); diff --git a/clang/runtime/dpct-rt/include/dpct/detail/group_utils_detail.hpp b/clang/runtime/dpct-rt/include/dpct/detail/group_utils_detail.hpp index aaac5ccae86a..db53890fc77f 100644 --- a/clang/runtime/dpct-rt/include/dpct/detail/group_utils_detail.hpp +++ b/clang/runtime/dpct-rt/include/dpct/detail/group_utils_detail.hpp @@ -36,9 +36,9 @@ template class radix_rank { radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {} - template + template __dpct_inline__ void - rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD], + rank_keys(const Item &item, KT (&keys)[VALUES_PER_THREAD], int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) { digit_counter_type thread_prefixes[VALUES_PER_THREAD]; @@ -204,10 +204,23 @@ template struct base_traits { } }; +template struct base_traits { + static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1); + static __dpct_inline__ U twiddle_in(U key) { + U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT; + return key ^ mask; + } + static __dpct_inline__ U twiddle_out(U key) { + U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1); + return key ^ mask; + } +}; + template struct traits : base_traits {}; template <> struct traits : base_traits {}; template <> struct traits : base_traits {}; template <> struct traits : base_traits {}; +template <> struct traits : base_traits {}; template struct power_of_two { enum { VALUE = ((N & (N - 1)) == 0) }; diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index a05182f50847..2e90b02b2993 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -360,14 +360,18 @@ class group_radix_sort { } private: + template + using key_type = + typename std::conditional::value, uint16_t, + uint32_t>::type; template __dpct_inline__ void helper_sort(const Item &item, T (&keys)[ElementsPerWorkItem], int begin_bit = 0, int end_bit = 8 * sizeof(T), bool is_striped = false) { - - uint32_t(&unsigned_keys)[ElementsPerWorkItem] = - reinterpret_cast(keys); + using UnsignedT = key_type; + UnsignedT(&unsigned_keys)[ElementsPerWorkItem] = + reinterpret_cast(keys); #pragma unroll for (int i = 0; i < ElementsPerWorkItem; ++i) { @@ -379,8 +383,8 @@ class group_radix_sort { int ranks[ElementsPerWorkItem]; detail::radix_rank(_local_memory) - .template rank_keys(item, unsigned_keys, - ranks, i, pass_bits); + .template rank_keys( + item, unsigned_keys, ranks, i, pass_bits); sycl::group_barrier(item.get_group()); diff --git a/clang/test/dpct/cub/blocklevel/blockradixsort.cu b/clang/test/dpct/cub/blocklevel/blockradixsort.cu index 0b54bb50f7c6..ba4b8d55499a 100644 --- a/clang/test/dpct/cub/blocklevel/blockradixsort.cu +++ b/clang/test/dpct/cub/blocklevel/blockradixsort.cu @@ -30,6 +30,28 @@ __global__ void Sort(int *data) { BlockStore(temp_storage_store).Store(data, thread_keys); } +__global__ void SortHalf(__half *data) { + // CHECK: using BlockRadixSort = dpct::group::group_radix_sort; + // CHECK-NEXT: using BlockLoad = dpct::group::group_load; + // CHECK-NEXT: using BlockStore = dpct::group::group_store; + // CHECK-NOT: __shared__ typename BlockLoad::TempStorage temp_storage_load; + // CHECK-NOT: __shared__ typename BlockStore::TempStorage temp_storage_store; + // CHECK-NOT: __shared__ typename BlockRadixSort::TempStorage temp_storage; + using BlockRadixSort = cub::BlockRadixSort<__half, 128, 4>; + using BlockLoad = cub::BlockLoad<__half, 128, 4>; + using BlockStore = cub::BlockStore<__half, 128, 4>; + __shared__ typename BlockLoad::TempStorage temp_storage_load; + __shared__ typename BlockStore::TempStorage temp_storage_store; + __shared__ typename BlockRadixSort::TempStorage temp_storage; + __half thread_keys[4]; + // CHECK: BlockLoad(temp_storage_load).load(item_ct1, data, thread_keys); + // CHECK-NEXT: BlockRadixSort(temp_storage).sort(item_ct1, thread_keys); + // CHECK-NEXT: BlockStore(temp_storage_store).store(item_ct1, data, thread_keys); + BlockLoad(temp_storage_load).Load(data, thread_keys); + BlockRadixSort(temp_storage).Sort(thread_keys); + BlockStore(temp_storage_store).Store(data, thread_keys); +} + __global__ void SortDescending(int *data) { // CHECK: using BlockRadixSort = dpct::group::group_radix_sort; // CHECK-NEXT: using BlockLoad = dpct::group::group_load; @@ -211,6 +233,42 @@ bool test_sort() { return true; } +bool test_sorthalf() { + __half data[512] = {0}, *d_data = nullptr; + cudaMalloc(&d_data, sizeof(__half) * 512); + for (int i = 0, x = 0, y = 511; i < 128; ++i) { + data[i * 4 + 0] = x++; + data[i * 4 + 1] = y--; + data[i * 4 + 2] = x++; + data[i * 4 + 3] = y--; + } + cudaMemcpy(d_data, data, sizeof(data), cudaMemcpyHostToDevice); + // CHECK: q_ct1.submit( + // CHECK-NEXT: [&](sycl::handler &cgh) { + // CHECK-NEXT: sycl::local_accessor temp_storage_load_acc(dpct::group::group_load::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh); + // CHECK-NEXT: sycl::local_accessor temp_storage_store_acc(dpct::group::group_store::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh); + // CHECK-NEXT: sycl::local_accessor temp_storage_acc(dpct::group::group_radix_sort::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh); + // CHECK-EMPTY: + // CHECK-NEXT: cgh.parallel_for( + // CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), + // CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) { + // CHECK-NEXT: SortHalf(d_data, item_ct1, &temp_storage_load_acc[0], &temp_storage_store_acc[0], &temp_storage_acc[0]); + // CHECK-NEXT: }); + // CHECK-NEXT: }); + SortHalf<<<1, 128>>>(d_data); + cudaDeviceSynchronize(); + cudaMemcpy(data, d_data, sizeof(data), cudaMemcpyDeviceToHost); + cudaFree(d_data); + for (int i = 0; i < 512; ++i) + if ((int)data[i] != i) { + printf("test_sorthalf failed\n"); + print_array((int)data); + return false; + } + printf("test_sorthalf pass\n"); + return true; +} + bool test_sort_descending() { int data[512] = {0}, *d_data = nullptr; cudaMalloc(&d_data, sizeof(int) * 512); @@ -610,7 +668,7 @@ bool test_sort_descending_blocked_to_striped_bit() { } int main() { - return !(test_sort() && test_sort_descending() && + return !(test_sort() && test_sorthalf() && test_sort_descending() && test_sort_blocked_to_striped() && test_sort_descending_blocked_to_striped() && test_sort_bit() && test_sort_descending_bit() && test_sort_blocked_to_striped_bit() && From 9b51b632eab01d4b0f53e7921046d8b7a7a340ff Mon Sep 17 00:00:00 2001 From: intwanghao Date: Wed, 23 Apr 2025 09:22:26 +0800 Subject: [PATCH 2/4] fix Signed-off-by: intwanghao --- clang/test/dpct/cub/blocklevel/blockradixsort.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/clang/test/dpct/cub/blocklevel/blockradixsort.cu b/clang/test/dpct/cub/blocklevel/blockradixsort.cu index ba4b8d55499a..2e30134fb0bf 100644 --- a/clang/test/dpct/cub/blocklevel/blockradixsort.cu +++ b/clang/test/dpct/cub/blocklevel/blockradixsort.cu @@ -193,8 +193,9 @@ __global__ void test_unsupported(int *data) { template void print_array(T (&arr)[N]) { - for (int i = 0; i < N; ++i) - printf("%d%c", arr[i], (i == N - 1 ? '\n' : ',')); + for (int i = 0; i < N; ++i) { + std::cout << arr[i] << (i == N - 1 ? '\n' : ','); + } } bool test_sort() { @@ -262,7 +263,7 @@ bool test_sorthalf() { for (int i = 0; i < 512; ++i) if ((int)data[i] != i) { printf("test_sorthalf failed\n"); - print_array((int)data); + print_array(data); return false; } printf("test_sorthalf pass\n"); From cc055d5ae47e1a7ab197a9dae9422aa97686e158 Mon Sep 17 00:00:00 2001 From: intwanghao Date: Tue, 29 Apr 2025 10:21:38 +0800 Subject: [PATCH 3/4] fix Signed-off-by: intwanghao --- clang/runtime/dpct-rt/include/dpct/group_utils.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp index 2e90b02b2993..804ebfc3cac2 100644 --- a/clang/runtime/dpct-rt/include/dpct/group_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/group_utils.hpp @@ -360,16 +360,14 @@ class group_radix_sort { } private: - template - using key_type = - typename std::conditional::value, uint16_t, - uint32_t>::type; template __dpct_inline__ void helper_sort(const Item &item, T (&keys)[ElementsPerWorkItem], int begin_bit = 0, int end_bit = 8 * sizeof(T), bool is_striped = false) { - using UnsignedT = key_type; + using UnsignedT = + typename std::conditional::value, uint16_t, + uint32_t>::type; UnsignedT(&unsigned_keys)[ElementsPerWorkItem] = reinterpret_cast(keys); From 26bb841a006f419005a26292e743048be94dc85c Mon Sep 17 00:00:00 2001 From: intwanghao Date: Tue, 29 Apr 2025 12:57:23 +0800 Subject: [PATCH 4/4] fix Signed-off-by: intwanghao --- clang/test/dpct/cub/blocklevel/blockradixsort.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/dpct/cub/blocklevel/blockradixsort.cu b/clang/test/dpct/cub/blocklevel/blockradixsort.cu index 9cb1a37c831f..6db1f27eb52d 100644 --- a/clang/test/dpct/cub/blocklevel/blockradixsort.cu +++ b/clang/test/dpct/cub/blocklevel/blockradixsort.cu @@ -194,7 +194,7 @@ __global__ void test_unsupported(int *data) { template void print_array(T (&arr)[N]) { for (int i = 0; i < N; ++i) { - std::cout << arr[i] << (i == N - 1 ? '\n' : ','); + std::cout << (int)arr[i] << (i == N - 1 ? '\n' : ','); } } @@ -253,7 +253,7 @@ bool test_sorthalf() { // CHECK-NEXT: cgh.parallel_for( // CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)), // CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) { - // CHECK-NEXT: SortHalf(d_data, item_ct1, &temp_storage_load_acc[0], &temp_storage_store_acc[0], &temp_storage_acc[0]); + // CHECK-NEXT: SortHalf(d_data, &temp_storage_load_acc[0], &temp_storage_store_acc[0], &temp_storage_acc[0]); // CHECK-NEXT: }); // CHECK-NEXT: }); SortHalf<<<1, 128>>>(d_data);