|
22 | 22 | #include <thrust/type_traits/is_contiguous_iterator.h> |
23 | 23 |
|
24 | 24 | #include <cooperative_groups.h> |
| 25 | +#include <cooperative_groups/memcpy_async.h> |
25 | 26 |
|
26 | 27 | namespace cuco { |
27 | 28 | template <typename Key, |
@@ -497,23 +498,31 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_ |
497 | 498 | } |
498 | 499 | offset = g.shfl(offset, 0); |
499 | 500 |
|
500 | | - if constexpr (thrust::is_contiguous_iterator_v<OutputIt>) { |
501 | 501 | #if defined(CUCO_HAS_CG_MEMCPY_ASYNC) |
| 502 | + constexpr bool uses_memcpy_async = thrust::is_contiguous_iterator_v<OutputIt>; |
| 503 | +#else |
| 504 | + constexpr bool uses_memcpy_async = false; |
| 505 | +#endif // end CUCO_HAS_CG_MEMCPY_ASYNC |
| 506 | + |
| 507 | + if constexpr (uses_memcpy_async) { |
502 | 508 | #if defined(CUCO_HAS_CUDA_BARRIER) |
503 | 509 | cooperative_groups::memcpy_async( |
504 | 510 | g, |
505 | | - output_begin + offset, |
| 511 | + &thrust::raw_reference_cast(*(output_begin + offset)), |
506 | 512 | output_buffer, |
507 | 513 | cuda::aligned_size_t<alignof(value_type)>(sizeof(value_type) * num_outputs)); |
508 | 514 | #else |
509 | | - cooperative_groups::memcpy_async( |
510 | | - g, output_begin + offset, output_buffer, sizeof(value_type) * num_outputs); |
| 515 | + cooperative_groups::memcpy_async(g, |
| 516 | + &thrust::raw_reference_cast(*(output_begin + offset)), |
| 517 | + output_buffer, |
| 518 | + sizeof(value_type) * num_outputs); |
511 | 519 | #endif // end CUCO_HAS_CUDA_BARRIER |
512 | | - return; |
513 | | -#endif // end CUCO_HAS_CG_MEMCPY_ASYNC |
514 | 520 | } |
515 | | - for (auto index = lane_id; index < num_outputs; index += g.size()) { |
516 | | - *(output_begin + offset + index) = output_buffer[index]; |
| 521 | + |
| 522 | + if constexpr (not uses_memcpy_async) { |
| 523 | + for (auto index = lane_id; index < num_outputs; index += g.size()) { |
| 524 | + *(output_begin + offset + index) = output_buffer[index]; |
| 525 | + } |
517 | 526 | } |
518 | 527 | } |
519 | 528 |
|
|
0 commit comments