Skip to content

Commit 59e9bcf

Browse files
small adjustments and flatter init optimization
1 parent 6393bc4 commit 59e9bcf

4 files changed

Lines changed: 36 additions & 40 deletions

File tree

examples/getAvailableSlots/source/main.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,19 +123,22 @@ auto main() -> int
123123
if(result != EXIT_SUCCESS)
124124
return;
125125

126+
std::cout << alpaka::onHost::demangledName<FlatterScatter<FlatterScatterHeapConfig>>() << ":\n";
126127
result = runExample<
127128
Executor,
128129
FlatterScatter<FlatterScatterHeapConfig>,
129130
mallocMC::ReservePoolPolicies::AlpakaBuf>(deviceSpec, exec);
130131
if(result != EXIT_SUCCESS)
131132
return;
133+
std::cout << alpaka::onHost::demangledName<Scatter<FlatterScatterHeapConfig>>() << ":\n";
132134
result = runExample<Executor, Scatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf>(
133135
deviceSpec,
134136
exec);
135137
#if ALPAKA_LANG_CUDA
136138
# ifdef mallocMC_HAS_Gallatin_AVAILABLE
137139
if(result == EXIT_SUCCESS)
138140
{
141+
std::cout << alpaka::onHost::demangledName<mallocMC::CreationPolicies::GallatinCuda<>>() << ":\n";
139142
result = runExample<
140143
Executor,
141144
mallocMC::CreationPolicies::GallatinCuda<>,
@@ -145,7 +148,10 @@ auto main() -> int
145148
# endif
146149
#endif
147150
if(result == EXIT_SUCCESS)
151+
{
152+
std::cout << alpaka::onHost::demangledName<OldMalloc>() << ":\n";
148153
result = runExample<Executor, OldMalloc, mallocMC::ReservePoolPolicies::Noop>(deviceSpec, exec);
154+
}
149155
},
150156
alpaka::onHost::allBackends(alpaka::onHost::enabledDeviceSpecs, alpaka::exec::enabledExecutors));
151157
return result;

examples/native-cuda/source/main.cu

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,13 @@
2626
THE SOFTWARE.
2727
*/
2828

29+
#include "mallocMC/span.hpp"
30+
2931
#include <mallocMC/mallocMC.cuh>
3032

3133
#include <cstdint>
3234
#include <cstdlib>
33-
#include <iostream>
35+
#include <functional>
3436

3537
/**
3638
* @brief Computes the sum of squares of the first `n` natural numbers.
@@ -65,26 +67,19 @@ __device__ auto sumOfSquares(auto const n)
6567
*/
6668
__global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManager, uint64_t numValues)
6769
{
70+
using mallocMC::span;
6871
uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x;
6972

7073
// Not very realistic, all threads are doing this on their own:
71-
auto* a = reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t)));
72-
auto* b = reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t)));
73-
if(a == nullptr || b == nullptr)
74-
{
75-
printf("Thread %lu: device allocation failed.\n", tid);
76-
__trap();
77-
}
74+
auto a
75+
= span<uint64_t>(reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))), numValues);
76+
auto b
77+
= span<uint64_t>(reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))), numValues);
7878

79-
for(uint64_t i = 0; i < numValues; ++i)
80-
{
81-
a[i] = tid + i;
82-
b[i] = tid + i;
83-
}
79+
std::iota(std::begin(a), std::end(a), tid);
80+
std::iota(std::begin(b), std::end(b), tid);
8481

85-
uint64_t result = 0U;
86-
for(uint64_t i = 0; i < numValues; ++i)
87-
result += a[i] * b[i];
82+
uint64_t result = std::transform_reduce(std::cbegin(a), std::cend(a), std::cbegin(b), 0U);
8883

8984
auto expected = sumOfSquares(numValues + tid - 1) - (tid > 0 ? sumOfSquares(tid - 1) : 0);
9085
if(result != expected)
@@ -93,8 +88,8 @@ __global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManag
9388
__trap();
9489
}
9590

96-
memoryManager.free(a);
97-
memoryManager.free(b);
91+
memoryManager.free(a.data());
92+
memoryManager.free(b.data());
9893
}
9994

10095
int main()
@@ -106,5 +101,4 @@ int main()
106101

107102
std::cout << "Running native CUDA kernel." << std::endl;
108103
oneDotProductPerThread<<<8, 256>>>(memoryManager, numValues);
109-
cudaDeviceSynchronize();
110104
}

examples/vectorAdd/source/main.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,12 @@ struct ShrinkConfig
4949
struct VectorAddKernel
5050
{
5151
template<typename TAcc, typename TAllocHandle, typename TSums>
52-
ALPAKA_FN_ACC void operator()(TAcc const& acc, TAllocHandle allocHandle, TSums sums, std::uint32_t len, std::uint32_t count)
53-
const
52+
ALPAKA_FN_ACC void operator()(
53+
TAcc const& acc,
54+
TAllocHandle allocHandle,
55+
TSums sums,
56+
std::uint32_t len,
57+
std::uint32_t count) const
5458
{
5559
for(auto [id] : alpaka::onAcc::makeIdxMap(acc, alpaka::onAcc::worker::threadsInGrid, alpaka::IdxRange{count}))
5660
{
@@ -121,10 +125,8 @@ auto runExample(auto const& deviceSpec, TExecutor exec) -> int
121125
std::cout << Allocator::info("\n") << '\n';
122126

123127
auto frameExtent = alpaka::Vec{Idx{threadsPerBlock}};
124-
auto frameSpec = alpaka::onHost::FrameSpec{
125-
alpaka::divCeil(alpaka::Vec{Idx{numWorkers}}, frameExtent),
126-
frameExtent,
127-
exec};
128+
auto frameSpec
129+
= alpaka::onHost::FrameSpec{alpaka::divCeil(alpaka::Vec{Idx{numWorkers}}, frameExtent), frameExtent, exec};
128130
queue.enqueue(
129131
frameSpec,
130132
alpaka::KernelBundle{VectorAddKernel{}, alloc.getAllocatorHandle(), sumsAcc, localLength, numWorkers});

include/mallocMC/creationPolicies/FlatterScatter.hpp

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -81,18 +81,13 @@ namespace mallocMC::CreationPolicies::FlatterScatterAlloc
8181
ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto init(auto const& acc, void* accessBlocksPointer, auto heapSize)
8282
-> void
8383
{
84-
auto const threadsInGrid = acc.getExtentsOf(alpaka::onAcc::origin::grid, alpaka::onAcc::unit::threads);
85-
auto const numThreads = threadsInGrid.product();
86-
auto const idx = static_cast<uint32_t>(alpaka::linearize(
87-
threadsInGrid,
88-
acc.getIdxWithin(alpaka::onAcc::origin::grid, alpaka::onAcc::unit::threads)));
8984
auto* accessBlocks = static_cast<MyAccessBlock*>(accessBlocksPointer);
9085

91-
for(uint32_t i = idx; i < numBlocks(heapSize) * MyAccessBlock::numPages(); i += numThreads)
86+
for(auto [blockIdx, pageIdx] : alpaka::onAcc::makeIdxMap(
87+
acc,
88+
alpaka::onAcc::worker::threadsInGrid,
89+
alpaka::IdxRange{alpaka::Vec{numBlocks(heapSize), MyAccessBlock::numPages()}}))
9290
{
93-
auto blockIdx = i / MyAccessBlock::numPages();
94-
auto pageIdx = i % MyAccessBlock::numPages();
95-
9691
accessBlocks[blockIdx].init(acc, pageIdx);
9792
}
9893
}
@@ -343,7 +338,6 @@ namespace mallocMC::CreationPolicies::FlatterScatterAlloc
343338
Heap<T_HeapConfig, T_HashConfig, T_AlignmentPolicy>::init(acc, m_heapmem, m_memsize);
344339
}
345340
};
346-
347341
} // namespace mallocMC::CreationPolicies::FlatterScatterAlloc
348342

349343
namespace mallocMC::CreationPolicies
@@ -421,9 +415,11 @@ namespace mallocMC::CreationPolicies
421415
return;
422416
}
423417
auto numPagesPerBlock = MyHeap::MyAccessBlock::numPages();
424-
queue.enqueue(
425-
alpaka::onHost::FrameSpec{alpaka::Vec{numBlocks}, alpaka::Vec{numPagesPerBlock}, TExecutor{}},
426-
alpaka::KernelBundle{FlatterScatterAlloc::InitKernel{}, heap, pool, memsize});
418+
auto frameSpec = alpaka::onHost::getFrameSpec<uint32_t>(
419+
queue.getDevice(),
420+
TExecutor{},
421+
alpaka::Vec{numBlocks, numPagesPerBlock});
422+
queue.enqueue(frameSpec, alpaka::KernelBundle{FlatterScatterAlloc::InitKernel{}, heap, pool, memsize});
427423
alpaka::onHost::wait(queue);
428424
}
429425

@@ -479,6 +475,4 @@ namespace mallocMC::CreationPolicies
479475
using HashConfig = T_HashConfig;
480476
};
481477
};
482-
483-
484478
} // namespace mallocMC::CreationPolicies

0 commit comments

Comments
 (0)