99
1010#include < nvbench_helper.cuh>
1111
12+ // %RANGE% TUNE_LOAD ld 0:2:1
13+ // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
14+ // %RANGE% TUNE_THREADS_PER_BLOCK_POW2 tpb 6:10:1
15+
16+ #if !TUNE_BASE
// Pick what TUNE_LOAD_MODIFIER expands to from the integer tuning parameter
// TUNE_LOAD: 0 -> cub::LOAD_DEFAULT, 1 -> cub::LOAD_LDG, anything else ->
// cub::LOAD_CA. The final branch is a plain #else, so values other than 2 are
// not rejected here; the %RANGE% directive above limits TUNE_LOAD to 0..2.
17+ # if TUNE_LOAD == 0
18+ # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
19+ # elif TUNE_LOAD == 1
20+ # define TUNE_LOAD_MODIFIER cub::LOAD_LDG
21+ # else // TUNE_LOAD == 2
22+ # define TUNE_LOAD_MODIFIER cub::LOAD_CA
23+ # endif // TUNE_LOAD
24+
// Callable that produces the cub::detail::find::find_policy used for tuning
// runs. It is only compiled inside the enclosing `#if !TUNE_BASE` region and is
// passed to cuda::execution::tune() by the find_if benchmark.
//
// The compute_capability parameter is unnamed and unused, so the returned
// policy is the same for every value passed in.
//
// The policy is brace-initialized, in order, with:
//   1. (1 << TUNE_THREADS_PER_BLOCK_POW2)
//   2. cub::Nominal4BItemsToItems<T>(TUNE_ITEMS_PER_THREAD)
//   3. the literal 4
//   4. TUNE_LOAD_MODIFIER
// NOTE(review): presumably these are block size, items per thread, a fixed
// width/vector setting, and the load modifier -- find_policy's member layout
// is not visible here, so confirm against its definition.
25+ template <typename T>
26+ struct bench_policy_selector
27+ {
28+ [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto operator ()(::cuda::compute_capability) const
29+ -> cub::detail::find::find_policy
30+ {
31+ return cub::detail::find::find_policy{
32+ (1 << TUNE_THREADS_PER_BLOCK_POW2), cub::Nominal4BItemsToItems<T>(TUNE_ITEMS_PER_THREAD), 4 , TUNE_LOAD_MODIFIER};
33+ }
34+ };
35+ #endif // !TUNE_BASE
36+
// nvbench benchmark body that runs cub::DeviceFind::FindIf for element type T
// and offset type OffsetT inside state.exec.
// NOTE(review): this text is a diff view. The "@@" hunk header below means part
// of the function body (where dinput, val and mismatch_point are set up) is not
// shown here. Lines marked '-' are the removed side of the diff and lines
// marked '+' are their replacements.
1237template <typename T, typename OffsetT>
1338void find_if (nvbench::state& state, nvbench::type_list<T, OffsetT>)
1439{
@@ -23,33 +48,27 @@ void find_if(nvbench::state& state, nvbench::type_list<T, OffsetT>)
// Overwrite everything from mismatch_point to the end of the input with val.
2348 thrust::fill (dinput.begin () + mismatch_point, dinput.end (), val);
// Single-element, uninitialized device buffer that receives the result.
2449 thrust::device_vector<OffsetT> d_result (1 , thrust::no_init);
2550
// Removed: explicit temp-storage pointer and size for the two-call pattern.
26- void * d_temp_storage = nullptr ;
27- size_t temp_storage_bytes{};
28-
// Report the expected traffic to nvbench: mismatch_point elements of T read
// and one OffsetT written.
2951 state.add_global_memory_reads <T>(mismatch_point);
3052 state.add_global_memory_writes <OffsetT>(1 );
3153
// Removed: first FindIf call with a null temp-storage pointer, followed by
// allocation of temp_storage_bytes bytes for the real call.
32- cub::DeviceFind::FindIf (
33- d_temp_storage,
34- temp_storage_bytes,
35- thrust::raw_pointer_cast (dinput.data ()),
36- thrust::raw_pointer_cast (d_result.data ()),
37- cuda::equal_to_value<T>(val),
38- static_cast <OffsetT>(dinput.size ()),
39- nullptr );
40-
41- thrust::device_vector<uint8_t > temp_storage (temp_storage_bytes, thrust::no_init);
42- d_temp_storage = thrust::raw_pointer_cast (temp_storage.data ());
43-
44- state.exec (nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
45- cub::DeviceFind::FindIf (
46- d_temp_storage,
47- temp_storage_bytes,
// Added: one allocator object, created before state.exec and captured by
// reference in the lambda below.
54+ caching_allocator_t alloc;
55+ state.exec (nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
// Environment handed to FindIf in place of the explicit temp-storage and
// stream arguments: built from the allocator and the nvbench launch, plus the
// tuning policy selector when TUNE_BASE is not set.
56+ auto env = cub_bench_env (
57+ alloc,
58+ launch
59+ #if !TUNE_BASE
60+ ,
61+ cuda::execution::tune (bench_policy_selector<T>{})
62+ #endif // !TUNE_BASE
63+ );
// FindIf is invoked through _CCCL_TRY_CUDA_API together with a failure message.
// NOTE(review): presumably the macro reports a non-success return value; its
// definition is not visible here.
64+ _CCCL_TRY_CUDA_API (
65+ cub::DeviceFind::FindIf,
66+ " FindIf failed" ,
4867 thrust::raw_pointer_cast (dinput.data ()),
4968 thrust::raw_pointer_cast (d_result.data ()),
5069 cuda::equal_to_value<T>(val),
5170 static_cast <OffsetT>(dinput.size ()),
52- launch. get_stream () );
71+ env );
5372 });
5473}
5574
0 commit comments