Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions clang/lib/DPCT/RulesLangLib/CUBAPIMigration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,11 +285,10 @@ void CubMemberCallRule::runRule(
CanTy->getAs<RecordType>()->getDecl());
const auto &ValueTyArg = ClassSpecDecl->getTemplateArgs()[0];

ValueTyArg.getAsType().getAsString();
std::string Fn;
llvm::raw_string_ostream OS(Fn);
OS << MapNames::getDpctNamespace() << "group::" << HelpFuncName << "<"
<< ValueTyArg.getAsType().getAsString();
<< DpctGlobalInfo::getReplacedTypeName(ValueTyArg.getAsType());
if (isBlockShuffle) {
if (!ClassSpecDecl->getTemplateArgs()[1].getIsDefaulted()) {
OS << ", " << ClassSpecDecl->getTemplateArgs()[1].getAsIntegral();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ template <int RADIX_BITS, bool DESCENDING = false> class radix_rank {

radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {}

template <typename Item, int VALUES_PER_THREAD>
template <typename Item, typename KT, int VALUES_PER_THREAD>
__dpct_inline__ void
rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD],
rank_keys(const Item &item, KT (&keys)[VALUES_PER_THREAD],
int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) {

digit_counter_type thread_prefixes[VALUES_PER_THREAD];
Expand Down Expand Up @@ -204,10 +204,23 @@ template <typename U> struct base_traits<float, U> {
}
};

template <typename U> struct base_traits<sycl::half, U> {
static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
static __dpct_inline__ U twiddle_in(U key) {
U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT;
return key ^ mask;
}
static __dpct_inline__ U twiddle_out(U key) {
U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1);
return key ^ mask;
}
};

template <typename T> struct traits : base_traits<T, T> {};
template <> struct traits<uint32_t> : base_traits<uint32_t, uint32_t> {};
template <> struct traits<int> : base_traits<int, uint32_t> {};
template <> struct traits<float> : base_traits<float, uint32_t> {};
template <> struct traits<sycl::half> : base_traits<sycl::half, uint16_t> {};

template <int N> struct power_of_two {
enum { VALUE = ((N & (N - 1)) == 0) };
Expand Down
12 changes: 7 additions & 5 deletions clang/runtime/dpct-rt/include/dpct/group_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,9 +365,11 @@ class group_radix_sort {
helper_sort(const Item &item, T (&keys)[ElementsPerWorkItem],
int begin_bit = 0, int end_bit = 8 * sizeof(T),
bool is_striped = false) {

uint32_t(&unsigned_keys)[ElementsPerWorkItem] =
reinterpret_cast<uint32_t(&)[ElementsPerWorkItem]>(keys);
using UnsignedT =
typename std::conditional<std::is_same<T, sycl::half>::value, uint16_t,
uint32_t>::type;
UnsignedT(&unsigned_keys)[ElementsPerWorkItem] =
reinterpret_cast<UnsignedT(&)[ElementsPerWorkItem]>(keys);

#pragma unroll
for (int i = 0; i < ElementsPerWorkItem; ++i) {
Expand All @@ -379,8 +381,8 @@ class group_radix_sort {

int ranks[ElementsPerWorkItem];
detail::radix_rank<RADIX_BITS, DESCENDING>(_local_memory)
.template rank_keys<Item, ElementsPerWorkItem>(item, unsigned_keys,
ranks, i, pass_bits);
.template rank_keys<Item, UnsignedT, ElementsPerWorkItem>(
item, unsigned_keys, ranks, i, pass_bits);

sycl::group_barrier(item.get_group());

Expand Down
65 changes: 62 additions & 3 deletions clang/test/dpct/cub/blocklevel/blockradixsort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,28 @@ __global__ void Sort(int *data) {
BlockStore(temp_storage_store).Store(data, thread_keys);
}

__global__ void SortHalf(__half *data) {
// CHECK: using BlockRadixSort = dpct::group::group_radix_sort<sycl::half, 4>;
// CHECK-NEXT: using BlockLoad = dpct::group::group_load<sycl::half, 4>;
// CHECK-NEXT: using BlockStore = dpct::group::group_store<sycl::half, 4>;
// CHECK-NOT: __shared__ typename BlockLoad::TempStorage temp_storage_load;
// CHECK-NOT: __shared__ typename BlockStore::TempStorage temp_storage_store;
// CHECK-NOT: __shared__ typename BlockRadixSort::TempStorage temp_storage;
using BlockRadixSort = cub::BlockRadixSort<__half, 128, 4>;
using BlockLoad = cub::BlockLoad<__half, 128, 4>;
using BlockStore = cub::BlockStore<__half, 128, 4>;
__shared__ typename BlockLoad::TempStorage temp_storage_load;
__shared__ typename BlockStore::TempStorage temp_storage_store;
__shared__ typename BlockRadixSort::TempStorage temp_storage;
__half thread_keys[4];
// CHECK: BlockLoad(temp_storage_load).load(item_ct1, data, thread_keys);
// CHECK-NEXT: BlockRadixSort(temp_storage).sort(item_ct1, thread_keys);
// CHECK-NEXT: BlockStore(temp_storage_store).store(item_ct1, data, thread_keys);
BlockLoad(temp_storage_load).Load(data, thread_keys);
BlockRadixSort(temp_storage).Sort(thread_keys);
BlockStore(temp_storage_store).Store(data, thread_keys);
}

__global__ void SortDescending(int *data) {
// CHECK: using BlockRadixSort = dpct::group::group_radix_sort<int, 4>;
// CHECK-NEXT: using BlockLoad = dpct::group::group_load<int, 4, dpct::group::group_load_algorithm::blocked>;
Expand Down Expand Up @@ -171,8 +193,9 @@ __global__ void test_unsupported(int *data) {

template <typename T, int N>
void print_array(T (&arr)[N]) {
for (int i = 0; i < N; ++i)
printf("%d%c", arr[i], (i == N - 1 ? '\n' : ','));
for (int i = 0; i < N; ++i) {
std::cout << (int)arr[i] << (i == N - 1 ? '\n' : ',');
}
}

bool test_sort() {
Expand Down Expand Up @@ -211,6 +234,42 @@ bool test_sort() {
return true;
}

bool test_sorthalf() {
__half data[512] = {0}, *d_data = nullptr;
cudaMalloc(&d_data, sizeof(__half) * 512);
for (int i = 0, x = 0, y = 511; i < 128; ++i) {
data[i * 4 + 0] = x++;
data[i * 4 + 1] = y--;
data[i * 4 + 2] = x++;
data[i * 4 + 3] = y--;
}
cudaMemcpy(d_data, data, sizeof(data), cudaMemcpyHostToDevice);
// CHECK: q_ct1.submit(
// CHECK-NEXT: [&](sycl::handler &cgh) {
// CHECK-NEXT: sycl::local_accessor<uint8_t, 1> temp_storage_load_acc(dpct::group::group_load<sycl::half, 4>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
// CHECK-NEXT: sycl::local_accessor<uint8_t, 1> temp_storage_store_acc(dpct::group::group_store<sycl::half, 4>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
// CHECK-NEXT: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::group_radix_sort<sycl::half, 4>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
// CHECK-EMPTY:
// CHECK-NEXT: cgh.parallel_for(
// CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
// CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {
// CHECK-NEXT: SortHalf(d_data, &temp_storage_load_acc[0], &temp_storage_store_acc[0], &temp_storage_acc[0]);
// CHECK-NEXT: });
// CHECK-NEXT: });
SortHalf<<<1, 128>>>(d_data);
cudaDeviceSynchronize();
cudaMemcpy(data, d_data, sizeof(data), cudaMemcpyDeviceToHost);
cudaFree(d_data);
for (int i = 0; i < 512; ++i)
if ((int)data[i] != i) {
printf("test_sorthalf failed\n");
print_array(data);
return false;
}
printf("test_sorthalf pass\n");
return true;
}

bool test_sort_descending() {
int data[512] = {0}, *d_data = nullptr;
cudaMalloc(&d_data, sizeof(int) * 512);
Expand Down Expand Up @@ -610,7 +669,7 @@ bool test_sort_descending_blocked_to_striped_bit() {
}

int main() {
return !(test_sort() && test_sort_descending() &&
return !(test_sort() && test_sorthalf() && test_sort_descending() &&
test_sort_blocked_to_striped() &&
test_sort_descending_blocked_to_striped() && test_sort_bit() &&
test_sort_descending_bit() && test_sort_blocked_to_striped_bit() &&
Expand Down