Skip to content

Commit a798873

Browse files
Use template wrapper<S, F>
1 parent 547b468 commit a798873

31 files changed

Lines changed: 721 additions & 208 deletions

.github/workflows/standalone-benchmark.yml

Lines changed: 30 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -12,52 +12,60 @@ jobs:
1212
runs-on: ${{ matrix.runner }}
1313
container: registry.cern.ch/alisw/slc9-gpu-builder:latest
1414
strategy:
15+
fail-fast: false
1516
matrix:
1617
name: [nvidia-h100, nvidia-l40s, amd-mi300x, amd-w7900]
1718
include:
1819
- name: nvidia-h100
1920
runner: cern-nextgen-h100
20-
cmake_args: -DENABLE_CUDA=1 -DENABLE_HIP=0 -DENABLE_OPENCL=0 -DCUDA_COMPUTETARGET=90
21-
ca_args: --gpuType CUDA --gpuDevice 0
21+
cmake_args: -DENABLE_CUDA=1 -DENABLE_HIP=0 -DCUDA_COMPUTETARGET=90
22+
ca_args: --gpuType CUDA #--RTCTECHloadLaunchBoundsFromFile genGPUArch/nvidia-h100.par
2223
- name: nvidia-l40s
2324
runner: cern-nextgen-l40s
24-
cmake_args: -DENABLE_CUDA=1 -DENABLE_HIP=0 -DENABLE_OPENCL=0 -DCUDA_COMPUTETARGET=89
25-
ca_args: --gpuType CUDA --gpuDevice 0
25+
cmake_args: -DENABLE_CUDA=1 -DENABLE_HIP=0 -DCUDA_COMPUTETARGET=89
26+
ca_args: --gpuType CUDA #--RTCTECHloadLaunchBoundsFromFile genGPUArch/nvidia-l40s.par
2627
- name: amd-mi300x
2728
runner: cern-nextgen-mi300x
28-
cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DENABLE_OPENCL=0 -DHIP_AMDGPUTARGET=gfx942
29-
ca_args: --gpuType HIP --gpuDevice 0 --RTCenable --RTCTECHloadLaunchBoundsFromFile genGPUArch/amd-mi300x.par
29+
cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DHIP_AMDGPUTARGET=gfx942
30+
ca_args: --gpuType HIP --RTCTECHloadLaunchBoundsFromFile genGPUArch/amd-mi300x.par
3031
- name: amd-w7900
3132
runner: cern-nextgen-w7900
32-
cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DENABLE_OPENCL=0 -DHIP_AMDGPUTARGET=gfx1100
33-
ca_args: --gpuType HIP --gpuDevice 0 --RTCenable --RTCTECHloadLaunchBoundsFromFile genGPUArch/amd-w7900.par
33+
cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DHIP_AMDGPUTARGET=gfx1100
34+
ca_args: --gpuType HIP --RTCTECHloadLaunchBoundsFromFile genGPUArch/amd-w7900.par
3435

3536
name: ${{ matrix.name }}
3637
steps:
3738
- name: Checkout Repository
3839
uses: actions/checkout@v4
3940

4041
- name: Build and Run
41-
continue-on-error: true
4242
run: |
43-
. ${WORK_DIR}/${ALIBUILD_ARCH_PREFIX}/O2/${O2_REVISION}/etc/profile.d/init.sh
44-
4543
mkdir -p ${STANDALONE_DIR}
44+
. ${WORK_DIR}/${ALIBUILD_ARCH_PREFIX}/O2/${O2_REVISION}/etc/profile.d/init.sh
4645
47-
curl -o /root/events.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/cuQAwSojyDrl6FR/events.tar.xz
48-
tar -xf /root/events.tar.xz -C ${STANDALONE_DIR}
49-
rm -f /root/events.tar.xz
50-
51-
curl -o /root/genGPUArch.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/3o2pvOVkINFU8qy/genGPUArch.tar.xz
52-
tar -xf /root/genGPUArch.tar.xz -C ${STANDALONE_DIR}
53-
rm -f /root/genGPUArch.tar.xz
54-
55-
cmake -B ${BUILD_DIR} ${{ matrix.cmake_args }} -DGPUCA_BUILD_EVENT_DISPLAY=0 -DCMAKE_INSTALL_PREFIX=${STANDALONE_DIR} ${GITHUB_WORKSPACE}/GPU/GPUTracking/Standalone/
46+
cmake -B ${BUILD_DIR} ${{ matrix.cmake_args }} -DENABLE_OPENCL=0 -DGPUCA_BUILD_EVENT_DISPLAY=0 -DGPUCA_DETERMINISTIC_MODE=GPU -DCMAKE_INSTALL_PREFIX=${STANDALONE_DIR} ${GITHUB_WORKSPACE}/GPU/GPUTracking/Standalone/
5647
cd ${BUILD_DIR}
5748
make install -j8
49+
5850
cd ${STANDALONE_DIR}
59-
${STANDALONE_DIR}/ca -e o2-simple -g ${{ matrix.ca_args }} --debug 1 > ${ARTIFACT_FILE}
60-
cat ${ARTIFACT_FILE}
51+
mkdir -p ${STANDALONE_DIR}/genGPUArch
52+
curl -v -o ${STANDALONE_DIR}/genGPUArch/${{ matrix.name }}.par https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/genGPUArch/${{ matrix.name }}.par
53+
54+
mkdir -p ${STANDALONE_DIR}/events
55+
56+
curl -v -o ${STANDALONE_DIR}/events/50kHz.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/50kHz.tar.xz
57+
tar -xf ${STANDALONE_DIR}/events/50kHz.tar.xz -C ${STANDALONE_DIR}/events
58+
${STANDALONE_DIR}/ca -e 50kHz -g --seed 0 --memSize 15000000000 --sync --runs 1 --RTCenable --PROCdeterministicGPUReconstruction 1 --RTCoptSpecialCode 1 --debug 1 ${{ matrix.ca_args }} > ${ARTIFACT_FILE}
59+
60+
curl -v -o ${STANDALONE_DIR}/events/o2-simple.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/o2-simple.tar.xz
61+
tar -xf ${STANDALONE_DIR}/events/o2-simple.tar.xz -C ${STANDALONE_DIR}/events
62+
${STANDALONE_DIR}/ca -e o2-simple -g --seed 0 --memSize 20000000000 --sync --runs 1 --RTCenable --PROCdeterministicGPUReconstruction 1 --RTCoptSpecialCode 1 --debug 6 ${{ matrix.ca_args }}
63+
64+
curl -v -o ${STANDALONE_DIR}/o2-simple-GPU.out https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/o2-simple-GPU.out
65+
cmp ${STANDALONE_DIR}/GPU.out ${STANDALONE_DIR}/o2-simple-GPU.out
66+
rm -rf ${STANDALONE_DIR}/GPU.out ${STANDALONE_DIR}/o2-simple-GPU.out
67+
68+
rm -rf ${STANDALONE_DIR}/events
6169
env:
6270
WORK_DIR: /cvmfs/alice.cern.ch
6371
ALIBUILD_ARCH_PREFIX: el9-x86_64/Packages

GPU/Common/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ set(HDRS_INSTALL
2626
GPUCommonTransform3D.h
2727
GPUROOTSMatrixFwd.h
2828
GPUROOTCartesianFwd.h
29-
GPUDebugStreamer.h)
29+
GPUDebugStreamer.h
30+
MemLayout.h)
3031

3132
if(ALIGPU_BUILD_TYPE STREQUAL "O2")
3233
o2_add_library(${MODULE}

GPU/Common/GPUCommonAlgorithm.h

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,20 @@ namespace o2::gpu
2828
{
2929
class GPUCommonAlgorithm
3030
{
31+
3132
public:
3233
template <class T>
33-
GPUd() static void sort(T* begin, T* end);
34+
GPUd() static void sort(T begin, T end);
3435
template <class T>
3536
GPUd() static void sortInBlock(T* begin, T* end);
3637
template <class T>
37-
GPUd() static void sortDeviceDynamic(T* begin, T* end);
38+
GPUd() static void sortDeviceDynamic(T begin, T end);
3839
template <class T, class S>
39-
GPUd() static void sort(T* begin, T* end, const S& comp);
40+
GPUd() static void sort(T begin, T end, const S& comp);
4041
template <class T, class S>
4142
GPUd() static void sortInBlock(T* begin, T* end, const S& comp);
4243
template <class T, class S>
43-
GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp);
44+
GPUd() static void sortDeviceDynamic(T begin, T end, const S& comp);
4445
#ifndef __OPENCL__
4546
template <class T, class S>
4647
GPUh() static void sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp);
@@ -224,7 +225,7 @@ namespace o2::gpu
224225
{
225226

226227
template <class T>
227-
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end)
228+
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T begin, T end)
228229
{
229230
#ifndef GPUCA_GPUCODE
230231
GPUCommonAlgorithm::sort(begin, end);
@@ -234,7 +235,7 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end)
234235
}
235236

236237
template <class T, class S>
237-
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& comp)
238+
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T begin, T end, const S& comp)
238239
{
239240
GPUCommonAlgorithm::sort(begin, end, comp);
240241
}
@@ -248,7 +249,7 @@ namespace o2::gpu
248249
{
249250

250251
template <class T>
251-
GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end)
252+
GPUdi() void GPUCommonAlgorithm::sort(T begin, T end)
252253
{
253254
#ifdef GPUCA_ALGORITHM_STD
254255
std::sort(begin, end);
@@ -258,7 +259,7 @@ GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end)
258259
}
259260

260261
template <class T, class S>
261-
GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end, const S& comp)
262+
GPUdi() void GPUCommonAlgorithm::sort(T begin, T end, const S& comp)
262263
{
263264
#ifdef GPUCA_ALGORITHM_STD
264265
std::sort(begin, end, comp);

GPU/Common/GPUCommonAlgorithmThrust.h

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
#include "GPUCommonDef.h"
2727
#include "GPUCommonHelpers.h"
28+
#include "GPUTPCTrack.h"
2829

2930
#ifndef __HIPCC__ // CUDA
3031
#include <cub/cub.cuh>
@@ -81,19 +82,15 @@ GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end, const S& comp)
8182
*/
8283

8384
template <class T>
84-
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end)
85+
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T begin, T end)
8586
{
86-
thrust::device_ptr<T> thrustBegin(begin);
87-
thrust::device_ptr<T> thrustEnd(end);
88-
thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd);
87+
thrust::sort(GPUCA_THRUST_NAMESPACE::par, begin, end);
8988
}
9089

9190
template <class T, class S>
92-
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& comp)
91+
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T begin, T end, const S& comp)
9392
{
94-
thrust::device_ptr<T> thrustBegin(begin);
95-
thrust::device_ptr<T> thrustEnd(end);
96-
thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd, comp);
93+
thrust::sort(GPUCA_THRUST_NAMESPACE::par, begin, end, comp);
9794
}
9895

9996
#ifndef GPUCA_GPUCODE_COMPILEKERNELS

0 commit comments

Comments
 (0)