Skip to content

Commit 06c277f

Browse files
committed
Make argument wrappers construction public
1 parent 6b7ae38 commit 06c277f

20 files changed

Lines changed: 457 additions & 422 deletions

cub/benchmarks/bench/segmented_topk/fixed/keys.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#include <cub/detail/choose_offset.cuh>
55
#include <cub/device/dispatch/dispatch_batched_topk.cuh>
66

7-
#include <cuda/__argument_>
7+
#include <cuda/argument>
88
#include <cuda/iterator>
99

1010
#include <nvbench_helper.cuh>
@@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys(
5151
const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected);
5252
const auto num_segments = ::cuda::std::max<std::size_t>(1, (max_elements / segment_size));
5353
const auto elements = num_segments * segment_size;
54-
const auto total_num_items = ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(elements)};
54+
const auto total_num_items = ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(elements)};
5555
const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));
5656

5757
// Skip workloads where k exceeds the segment size
@@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys(
6868
auto d_keys_in = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size);
6969
auto d_keys_out = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements);
7070

71-
auto segment_sizes = ::cuda::__argument::__constant<MaxSegmentSize>{};
72-
auto k = ::cuda::__argument::__constant<MaxNumSelected>{};
73-
auto select_direction = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
71+
auto segment_sizes = ::cuda::argument::constant<MaxSegmentSize>{};
72+
auto k = ::cuda::argument::constant<MaxNumSelected>{};
73+
auto select_direction = ::cuda::argument::constant<cub::detail::topk::select::max>{};
7474

7575
state.add_element_count(elements, "NumElements");
7676
state.add_element_count(segment_size, "SegmentSize");
@@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys(
9999
segment_sizes,
100100
k,
101101
select_direction,
102-
::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(num_segments)},
102+
::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(num_segments)},
103103
total_num_items,
104104
env);
105105
});

cub/benchmarks/bench/segmented_topk/variable/keys.cu

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
#include <thrust/reduce.h>
99
#include <thrust/tabulate.h>
1010

11-
#include <cuda/__argument_>
11+
#include <cuda/argument>
1212
#include <cuda/iterator>
1313
#include <cuda/random>
1414
#include <cuda/std/algorithm>
@@ -172,17 +172,17 @@ void variable_seg_size_topk_keys(nvbench::state& state,
172172
static_cast<cuda::std::int64_t>(MaxSegmentSize));
173173
const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end());
174174
const auto output_elements = static_cast<std::size_t>(num_segments) * K;
175-
const auto total_num_items = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(input_elements)};
175+
const auto total_num_items = ::cuda::argument::immediate{static_cast<cuda::std::int64_t>(input_elements)};
176176

177177
auto in_keys_buffer = gen_data<MaxSegmentSize, K>(
178178
num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data()));
179179
auto out_keys_buffer = thrust::device_vector<KeyT>(output_elements, thrust::no_init);
180180

181-
auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{
182-
thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()};
183-
auto k_param = ::cuda::__argument::__constant<K>{};
184-
auto select_direction = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
185-
auto num_segments_param = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(num_segments)};
181+
auto segment_sizes_param = ::cuda::argument::immediate_sequence{
182+
thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::argument::bounds<1, MaxSegmentSize>()};
183+
auto k_param = ::cuda::argument::constant<K>{};
184+
auto select_direction = ::cuda::argument::constant<cub::detail::topk::select::max>{};
185+
auto num_segments_param = ::cuda::argument::immediate{static_cast<cuda::std::int64_t>(num_segments)};
186186

187187
auto d_keys_in = cuda::make_strided_iterator(
188188
cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())),

cub/cub/agent/agent_batched_topk.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
#include <cub/device/dispatch/tuning/tuning_batched_topk.cuh>
2424
#include <cub/util_type.cuh>
2525

26-
#include <cuda/__argument_>
2726
#include <cuda/__cmath/ceil_div.h>
27+
#include <cuda/argument>
2828

2929
CUB_NAMESPACE_BEGIN
3030

@@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment
7373
using key_t = it_value_t<key_it_t>;
7474
using value_t = it_value_t<value_it_t>;
7575

76-
using segment_size_val_t = typename ::cuda::__argument::__traits<SegmentSizeParameterT>::element_type;
77-
using num_segments_val_t = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
76+
using segment_size_val_t = typename ::cuda::argument::__traits<SegmentSizeParameterT>::element_type;
77+
using num_segments_val_t = typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type;
7878
using counters_t = batched_topk_counters<num_segments_val_t>;
7979

8080
static constexpr auto policy = PolicyGetter{}();
@@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment
9595
multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread;
9696

9797
// Check if there could be large segments present
98-
static constexpr bool only_small_segments = ::cuda::__argument::__traits<SegmentSizeParameterT>::highest <= tile_size;
98+
static constexpr bool only_small_segments = ::cuda::argument::__traits<SegmentSizeParameterT>::highest <= tile_size;
9999

100100
// Check if we are dealing with keys-only or key-value pairs
101101
static constexpr bool is_keys_only = ::cuda::std::is_same_v<value_t, cub::NullType>;
@@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment
196196
return;
197197
}
198198

199-
constexpr bool is_full_tile = ::cuda::__argument::__traits<SegmentSizeParameterT>::is_constant
200-
&& ::cuda::__argument::__traits<SegmentSizeParameterT>::lowest == tile_size;
199+
constexpr bool is_full_tile = ::cuda::argument::__traits<SegmentSizeParameterT>::is_constant
200+
&& ::cuda::argument::__traits<SegmentSizeParameterT>::lowest == tile_size;
201201

202202
// Resolve Segment Parameters
203203
const auto segment_size = params::get_param(segment_sizes, segment_id);

cub/cub/detail/segmented_params.cuh

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# pragma system_header
1414
#endif // no system header
1515

16-
#include <cuda/__argument_>
16+
#include <cuda/argument>
1717
#include <cuda/std/__type_traits/integral_constant.h>
1818
#include <cuda/std/__type_traits/remove_cvref.h>
1919
#include <cuda/std/__utility/forward.h>
@@ -33,10 +33,10 @@ namespace detail::params
3333
//! @param[in] __index Segment index to read for sequence arguments.
3434
//! @return The single argument value, or the sequence element at the given index.
3535
_CCCL_TEMPLATE(class _Tp, class _SegmentIndexT)
36-
_CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
36+
_CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
3737
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
3838
{
39-
if constexpr (::cuda::__argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
39+
if constexpr (::cuda::argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
4040
{
4141
return __arg;
4242
}
@@ -48,44 +48,44 @@ _CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<
4848

4949
template <auto _Value, class _SegmentIndexT>
5050
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
51-
get_param(const ::cuda::__argument::__constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
51+
get_param(const ::cuda::argument::constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
5252
{
53-
return ::cuda::__argument::__unwrap(__arg);
53+
return ::cuda::argument::__unwrap(__arg);
5454
}
5555

5656
template <auto _Value, class _SegmentIndexT>
5757
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
58-
get_param(const ::cuda::__argument::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
58+
get_param(const ::cuda::argument::constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
5959
{
60-
return ::cuda::__argument::__unwrap(__arg)[__index];
60+
return ::cuda::argument::__unwrap(__arg)[__index];
6161
}
6262

6363
template <class _Arg, class _StaticBounds, class _SegmentIndexT>
6464
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
65-
const ::cuda::__argument::__immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
65+
const ::cuda::argument::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
6666
{
67-
return ::cuda::__argument::__unwrap(__arg);
67+
return ::cuda::argument::__unwrap(__arg);
6868
}
6969

7070
template <class _Arg, class _StaticBounds, class _SegmentIndexT>
7171
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
72-
get_param(const ::cuda::__argument::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
72+
get_param(const ::cuda::argument::immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
7373
{
74-
return ::cuda::__argument::__unwrap(__arg)[__index];
74+
return ::cuda::argument::__unwrap(__arg)[__index];
7575
}
7676

7777
template <class _Arg, class _StaticBounds, class _SegmentIndexT>
7878
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
79-
const ::cuda::__argument::__deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
79+
const ::cuda::argument::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
8080
{
81-
return ::cuda::__argument::__unwrap(__arg);
81+
return ::cuda::argument::__unwrap(__arg);
8282
}
8383

8484
template <class _Arg, class _StaticBounds, class _SegmentIndexT>
8585
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
86-
get_param(const ::cuda::__argument::__deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
86+
get_param(const ::cuda::argument::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
8787
{
88-
return ::cuda::__argument::__unwrap(__arg)[__index];
88+
return ::cuda::argument::__unwrap(__arg)[__index];
8989
}
9090

9191
// =====================================================================

cub/cub/device/dispatch/dispatch_batched_topk.cuh

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@
3131

3232
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
3333

34-
#include <cuda/__argument_>
3534
#include <cuda/__cmath/ceil_div.h>
3635
#include <cuda/__iterator/counting_iterator.h>
3736
#include <cuda/__iterator/transform_iterator.h>
37+
#include <cuda/argument>
3838
#include <cuda/std/__functional/operations.h>
3939
#include <cuda/std/__type_traits/is_same.h>
4040
#include <cuda/std/__type_traits/remove_cv.h>
@@ -49,9 +49,9 @@ namespace detail::batched_topk
4949
// Internal: wrap user-facing select direction into discrete param for dispatch
5050
// -----------------------------------------------------------------------------
5151

52-
// Uniform (compile-time): __constant<Dir> -> single-option uniform_discrete_param.
52+
// Uniform (compile-time): constant<Dir> -> single-option uniform_discrete_param.
5353
template <detail::topk::select Dir>
54-
[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::__argument::__constant<Dir>)
54+
[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::argument::constant<Dir>)
5555
{
5656
return params::uniform_discrete_param<detail::topk::select, Dir>{Dir};
5757
}
@@ -126,7 +126,7 @@ template <typename KeyInputItItT,
126126
typename PolicySelector = policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
127127
it_value_t<it_value_t<ValueInputItItT>>,
128128
::cuda::std::int64_t,
129-
::cuda::__argument::__traits<KParameterT>::highest>>
129+
::cuda::argument::__traits<KParameterT>::highest>>
130130
#if _CCCL_HAS_CONCEPTS()
131131
requires batched_topk_policy_selector<PolicySelector>
132132
#endif // _CCCL_HAS_CONCEPTS()
@@ -145,7 +145,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
145145
cudaStream_t stream = nullptr,
146146
[[maybe_unused]] PolicySelector policy_selector = {})
147147
{
148-
using large_segment_tile_offset_t = typename ::cuda::__argument::__traits<TotalNumItemsGuaranteeT>::element_type;
148+
using large_segment_tile_offset_t = typename ::cuda::argument::__traits<TotalNumItemsGuaranteeT>::element_type;
149149

150150
// Wrap the raw enum into the internal discrete param type
151151
auto select_directions = wrap_select_direction(select_direction);
@@ -171,9 +171,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
171171
static constexpr int worker_per_segment_tile_size =
172172
worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread;
173173
static constexpr bool any_small_segments =
174-
::cuda::__argument::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
174+
::cuda::argument::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
175175
static constexpr bool only_small_segments =
176-
::cuda::__argument::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
176+
::cuda::argument::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
177177

178178
// Allocation layout:
179179
// only_small_segments: [0] dummy.
@@ -183,7 +183,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
183183
static constexpr int allocations_array_size = only_small_segments ? 1 : (any_small_segments ? 3 : 2);
184184
size_t allocation_sizes[allocations_array_size] = {1};
185185

186-
using num_segments_val_t = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
186+
using num_segments_val_t = typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type;
187187
using counters_t = batched_topk_counters<num_segments_val_t>;
188188
using segment_size_scan_offset_t = detail::choose_offset_t<num_segments_val_t>;
189189
using segment_size_scan_input_op_t =
@@ -239,7 +239,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
239239

240240
// TODO (elstehle): support number of segments provided by device-accessible iterator
241241
// Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host)
242-
static_assert(::cuda::__argument::__traits<NumSegmentsParameterT>::is_single_value,
242+
static_assert(::cuda::argument::__traits<NumSegmentsParameterT>::is_single_value,
243243
"Only uniform segment sizes are currently supported.");
244244

245245
if constexpr (any_small_segments)
@@ -341,7 +341,7 @@ template <typename KeyInputItItT,
341341
policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
342342
it_value_t<it_value_t<ValueInputItItT>>,
343343
::cuda::std::int64_t,
344-
::cuda::__argument::__traits<KParameterT>::highest>;
344+
::cuda::argument::__traits<KParameterT>::highest>;
345345
return detail::dispatch_with_env_and_tuning<default_policy_selector>(
346346
env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) {
347347
return dispatch(

cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
#include <cub/device/dispatch/tuning/tuning_batched_topk.cuh>
2121
#include <cub/util_arch.cuh>
2222

23-
#include <cuda/__argument_>
2423
#include <cuda/__device/compute_capability.h>
24+
#include <cuda/argument>
2525

2626
CUB_NAMESPACE_BEGIN
2727

@@ -39,7 +39,7 @@ private:
3939
worker_policy worker_per_segment_policy;
4040
multi_worker_policy multi_worker_per_segment_policy;
4141
};
42-
static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::__argument::__traits<SegmentSizeParameterT>::highest;
42+
static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::argument::__traits<SegmentSizeParameterT>::highest;
4343
static constexpr batched_topk_policy active_policy = current_policy<PolicySelector>();
4444

4545
template <int Index>
@@ -133,8 +133,8 @@ __launch_bounds__(int(
133133
KParameterT k,
134134
SelectDirectionParameterT select_directions,
135135
NumSegmentsParameterT num_segments,
136-
batched_topk_counters<typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type>* d_counters,
137-
typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
136+
batched_topk_counters<typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type>* d_counters,
137+
typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
138138
LargeSegmentTileOffsetT* d_large_segments_tile_offsets)
139139
{
140140
using agent_t = typename find_smallest_covering_policy<
@@ -151,7 +151,7 @@ __launch_bounds__(int(
151151
LargeSegmentTileOffsetT>::agent_t;
152152

153153
// Static Assertions (Constraints)
154-
static_assert(agent_t::tile_size >= ::cuda::__argument::__traits<SegmentSizeParameterT>::highest,
154+
static_assert(agent_t::tile_size >= ::cuda::argument::__traits<SegmentSizeParameterT>::highest,
155155
"Block size exceeds maximum segment size supported by SegmentSizeParameterT");
156156
static_assert(sizeof(typename agent_t::TempStorage) <= max_smem_per_block,
157157
"Static shared memory per block must not exceed 48KB limit.");

cub/test/catch2_test_device_segmented_topk_keys.cu

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -151,11 +151,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments",
151151
batched_topk_keys(
152152
d_keys_in,
153153
d_keys_out,
154-
::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds<segment_size_t{1}, max_segment_size>()},
155-
::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
154+
::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<segment_size_t{1}, max_segment_size>()},
155+
::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
156156
direction,
157-
::cuda::__argument::__immediate{num_segments},
158-
::cuda::__argument::__immediate{num_segments * segment_size});
157+
::cuda::argument::immediate{num_segments},
158+
::cuda::argument::immediate{num_segments * segment_size});
159159
// Prepare expected results
160160
fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction);
161161
compact_sorted_keys_to_topk(expected_keys, segment_size, k);
@@ -248,12 +248,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment
248248
batched_topk_keys(
249249
d_keys_in,
250250
d_keys_out,
251-
::cuda::__argument::__immediate_sequence{
252-
segment_size_it, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_segment_size>()},
253-
::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
251+
::cuda::argument::immediate_sequence{
252+
segment_size_it, ::cuda::argument::bounds<segment_size_t{1}, static_max_segment_size>()},
253+
::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
254254
direction,
255-
::cuda::__argument::__immediate{num_segments},
256-
::cuda::__argument::__immediate{num_items});
255+
::cuda::argument::immediate{num_segments},
256+
::cuda::argument::immediate{num_items});
257257

258258
// Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k
259259
segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction);
@@ -286,12 +286,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment
286286
batched_topk_keys(
287287
d_keys_in_it,
288288
d_keys_out_it,
289-
::cuda::__argument::__immediate{
290-
segment_size, ::cuda::__argument::__bounds<cuda::std::int64_t{1}, max_segment_size>()},
291-
::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<cuda::std::int64_t{1}, k>()},
289+
::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<cuda::std::int64_t{1}, max_segment_size>()},
290+
::cuda::argument::immediate{k, ::cuda::argument::bounds<cuda::std::int64_t{1}, k>()},
292291
cub::detail::topk::select::min,
293-
::cuda::__argument::__immediate{num_segments},
294-
::cuda::__argument::__immediate{num_segments * segment_size});
292+
::cuda::argument::immediate{num_segments},
293+
::cuda::argument::immediate{num_segments * segment_size});
295294

296295
const int num_minus_zero = static_cast<int>(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{}));
297296
REQUIRE(num_minus_zero >= 1);

0 commit comments

Comments
 (0)