cern-nextgen
diff --git a/.github/workflows/standalone-benchmark.yml b/.github/workflows/standalone-benchmark.yml
Lines changed: 30 additions & 22 deletions
diff --git a/GPU/Common/CMakeLists.txt b/GPU/Common/CMakeLists.txt
Lines changed: 2 additions & 1 deletion
diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h
Lines changed: 9 additions & 8 deletions
diff --git a/GPU/Common/GPUCommonAlgorithmThrust.h b/GPU/Common/GPUCommonAlgorithmThrust.h
Lines changed: 5 additions & 8 deletions
@@ -12,52 +12,60 @@ jobs:
     runs-on: ${{ matrix.runner }}
     container: registry.cern.ch/alisw/slc9-gpu-builder:latest
     strategy:
+      fail-fast: false
       matrix:
         name: [nvidia-h100, nvidia-l40s, amd-mi300x, amd-w7900]
         include:
           - name: nvidia-h100
             runner: cern-nextgen-h100
-            cmake_args: -DENABLE_CUDA=1 -DENABLE_HIP=0 -DENABLE_OPENCL=0 -DCUDA_COMPUTETARGET=90
-            ca_args: --gpuType CUDA --gpuDevice 0
+            cmake_args: -DENABLE_CUDA=1 -DENABLE_HIP=0 -DCUDA_COMPUTETARGET=90
+            ca_args: --gpuType CUDA #--RTCTECHloadLaunchBoundsFromFile genGPUArch/nvidia-h100.par
           - name: nvidia-l40s
             runner: cern-nextgen-l40s
-            cmake_args: -DENABLE_CUDA=1 -DENABLE_HIP=0 -DENABLE_OPENCL=0 -DCUDA_COMPUTETARGET=89
-            ca_args: --gpuType CUDA --gpuDevice 0
+            cmake_args: -DENABLE_CUDA=1 -DENABLE_HIP=0 -DCUDA_COMPUTETARGET=89
+            ca_args: --gpuType CUDA #--RTCTECHloadLaunchBoundsFromFile genGPUArch/nvidia-l40s.par
           - name: amd-mi300x
             runner: cern-nextgen-mi300x
-            cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DENABLE_OPENCL=0 -DHIP_AMDGPUTARGET=gfx942
-            ca_args: --gpuType HIP --gpuDevice 0 --RTCenable --RTCTECHloadLaunchBoundsFromFile genGPUArch/amd-mi300x.par
+            cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DHIP_AMDGPUTARGET=gfx942
+            ca_args: --gpuType HIP --RTCTECHloadLaunchBoundsFromFile genGPUArch/amd-mi300x.par
           - name: amd-w7900
             runner: cern-nextgen-w7900
-            cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DENABLE_OPENCL=0 -DHIP_AMDGPUTARGET=gfx1100
-            ca_args: --gpuType HIP --gpuDevice 0 --RTCenable --RTCTECHloadLaunchBoundsFromFile genGPUArch/amd-w7900.par
+            cmake_args: -DENABLE_CUDA=0 -DENABLE_HIP=1 -DHIP_AMDGPUTARGET=gfx1100
+            ca_args: --gpuType HIP --RTCTECHloadLaunchBoundsFromFile genGPUArch/amd-w7900.par
 
     name: ${{ matrix.name }}
     steps:
       - name: Checkout Repository
         uses: actions/checkout@v4
 
       - name: Build and Run
-        continue-on-error: true
         run: |
-          . ${WORK_DIR}/${ALIBUILD_ARCH_PREFIX}/O2/${O2_REVISION}/etc/profile.d/init.sh
-
           mkdir -p ${STANDALONE_DIR}
+          . ${WORK_DIR}/${ALIBUILD_ARCH_PREFIX}/O2/${O2_REVISION}/etc/profile.d/init.sh
 
-          curl -o /root/events.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/cuQAwSojyDrl6FR/events.tar.xz
-          tar -xf /root/events.tar.xz -C ${STANDALONE_DIR}
-          rm -f /root/events.tar.xz
-
-          curl -o /root/genGPUArch.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/3o2pvOVkINFU8qy/genGPUArch.tar.xz
-          tar -xf /root/genGPUArch.tar.xz -C ${STANDALONE_DIR}
-          rm -f /root/genGPUArch.tar.xz
-
-          cmake -B ${BUILD_DIR} ${{ matrix.cmake_args }} -DGPUCA_BUILD_EVENT_DISPLAY=0 -DCMAKE_INSTALL_PREFIX=${STANDALONE_DIR} ${GITHUB_WORKSPACE}/GPU/GPUTracking/Standalone/
+          cmake -B ${BUILD_DIR} ${{ matrix.cmake_args }} -DENABLE_OPENCL=0 -DGPUCA_BUILD_EVENT_DISPLAY=0 -DGPUCA_DETERMINISTIC_MODE=GPU -DCMAKE_INSTALL_PREFIX=${STANDALONE_DIR} ${GITHUB_WORKSPACE}/GPU/GPUTracking/Standalone/
           cd ${BUILD_DIR}
           make install -j8
+
           cd ${STANDALONE_DIR}
-          ${STANDALONE_DIR}/ca -e o2-simple -g ${{ matrix.ca_args }} --debug 1 > ${ARTIFACT_FILE}
-          cat ${ARTIFACT_FILE}
+          mkdir -p ${STANDALONE_DIR}/genGPUArch
+          curl -v -o ${STANDALONE_DIR}/genGPUArch/${{ matrix.name }}.par https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/genGPUArch/${{ matrix.name }}.par
+
+          mkdir -p ${STANDALONE_DIR}/events
+
+          curl -v -o ${STANDALONE_DIR}/events/50kHz.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/50kHz.tar.xz
+          tar -xf ${STANDALONE_DIR}/events/50kHz.tar.xz -C ${STANDALONE_DIR}/events
+          ${STANDALONE_DIR}/ca -e 50kHz -g --seed 0 --memSize 15000000000 --sync --runs 1 --RTCenable --PROCdeterministicGPUReconstruction 1 --RTCoptSpecialCode 1 --debug 1 ${{ matrix.ca_args }} > ${ARTIFACT_FILE}
+
+          curl -v -o ${STANDALONE_DIR}/events/o2-simple.tar.xz https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/events/o2-simple.tar.xz
+          tar -xf ${STANDALONE_DIR}/events/o2-simple.tar.xz -C ${STANDALONE_DIR}/events
+          ${STANDALONE_DIR}/ca -e o2-simple -g --seed 0 --memSize 20000000000 --sync --runs 1 --RTCenable --PROCdeterministicGPUReconstruction 1 --RTCoptSpecialCode 1 --debug 6 ${{ matrix.ca_args }}
+          
+          curl -v -o ${STANDALONE_DIR}/o2-simple-GPU.out https://cernbox.cern.ch/remote.php/dav/public-files/SfYXgQOHFga2w75/o2-simple-GPU.out
+          cmp ${STANDALONE_DIR}/GPU.out ${STANDALONE_DIR}/o2-simple-GPU.out
+          rm -rf ${STANDALONE_DIR}/GPU.out ${STANDALONE_DIR}/o2-simple-GPU.out
+
+          rm -rf ${STANDALONE_DIR}/events
         env:
           WORK_DIR: /cvmfs/alice.cern.ch
           ALIBUILD_ARCH_PREFIX: el9-x86_64/Packages
 
@@ -26,7 +26,8 @@ set(HDRS_INSTALL
     GPUCommonTransform3D.h
     GPUROOTSMatrixFwd.h
     GPUROOTCartesianFwd.h
-    GPUDebugStreamer.h)
+    GPUDebugStreamer.h
+    MemLayout.h)
 
 if(ALIGPU_BUILD_TYPE STREQUAL "O2")
   o2_add_library(${MODULE}
 
@@ -28,19 +28,20 @@ namespace o2::gpu
 {
 class GPUCommonAlgorithm
 {
+
  public:
   template <class T>
-  GPUd() static void sort(T* begin, T* end);
+  GPUd() static void sort(T begin, T end);
   template <class T>
   GPUd() static void sortInBlock(T* begin, T* end);
   template <class T>
-  GPUd() static void sortDeviceDynamic(T* begin, T* end);
+  GPUd() static void sortDeviceDynamic(T begin, T end);
   template <class T, class S>
-  GPUd() static void sort(T* begin, T* end, const S& comp);
+  GPUd() static void sort(T begin, T end, const S& comp);
   template <class T, class S>
   GPUd() static void sortInBlock(T* begin, T* end, const S& comp);
   template <class T, class S>
-  GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp);
+  GPUd() static void sortDeviceDynamic(T begin, T end, const S& comp);
 #ifndef __OPENCL__
   template <class T, class S>
   GPUh() static void sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp);
@@ -224,7 +225,7 @@ namespace o2::gpu
 {
 
 template <class T>
-GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end)
+GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T begin, T end)
 {
 #ifndef GPUCA_GPUCODE
   GPUCommonAlgorithm::sort(begin, end);
@@ -234,7 +235,7 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end)
 }
 
 template <class T, class S>
-GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& comp)
+GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T begin, T end, const S& comp)
 {
   GPUCommonAlgorithm::sort(begin, end, comp);
 }
@@ -248,7 +249,7 @@ namespace o2::gpu
 {
 
 template <class T>
-GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end)
+GPUdi() void GPUCommonAlgorithm::sort(T begin, T end)
 {
 #ifdef GPUCA_ALGORITHM_STD
   std::sort(begin, end);
@@ -258,7 +259,7 @@ GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end)
 }
 
 template <class T, class S>
-GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end, const S& comp)
+GPUdi() void GPUCommonAlgorithm::sort(T begin, T end, const S& comp)
 {
 #ifdef GPUCA_ALGORITHM_STD
   std::sort(begin, end, comp);
 
@@ -25,6 +25,7 @@
 
 #include "GPUCommonDef.h"
 #include "GPUCommonHelpers.h"
+#include "GPUTPCTrack.h"
 
 #ifndef __HIPCC__ // CUDA
 #include <cub/cub.cuh>
@@ -81,19 +82,15 @@ GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end, const S& comp)
 */
 
 template <class T>
-GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end)
+GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T begin, T end)
 {
-  thrust::device_ptr<T> thrustBegin(begin);
-  thrust::device_ptr<T> thrustEnd(end);
-  thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd);
+  thrust::sort(GPUCA_THRUST_NAMESPACE::par, begin, end);
 }
 
 template <class T, class S>
-GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& comp)
+GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T begin, T end, const S& comp)
 {
-  thrust::device_ptr<T> thrustBegin(begin);
-  thrust::device_ptr<T> thrustEnd(end);
-  thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd, comp);
+  thrust::sort(GPUCA_THRUST_NAMESPACE::par, begin, end, comp);
 }
 
 #ifndef GPUCA_GPUCODE_COMPILEKERNELS
Original file line number	Diff line number	Diff line change
`@@ -28,19 +28,20 @@ namespace o2::gpu`
`28`	`28`	`{`
`29`	`29`	`class GPUCommonAlgorithm`
`30`	`30`	`{`
	`31`	`+`
`31`	`32`	`public:`
`32`	`33`	`template <class T>`
`33`		`- GPUd() static void sort(T* begin, T* end);`
	`34`	`+ GPUd() static void sort(T begin, T end);`
`34`	`35`	`template <class T>`
`35`	`36`	`GPUd() static void sortInBlock(T* begin, T* end);`
`36`	`37`	`template <class T>`
`37`		`- GPUd() static void sortDeviceDynamic(T* begin, T* end);`
	`38`	`+ GPUd() static void sortDeviceDynamic(T begin, T end);`
`38`	`39`	`template <class T, class S>`
`39`		`- GPUd() static void sort(T* begin, T* end, const S& comp);`
	`40`	`+ GPUd() static void sort(T begin, T end, const S& comp);`
`40`	`41`	`template <class T, class S>`
`41`	`42`	`GPUd() static void sortInBlock(T* begin, T* end, const S& comp);`
`42`	`43`	`template <class T, class S>`
`43`		`- GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp);`
	`44`	`+ GPUd() static void sortDeviceDynamic(T begin, T end, const S& comp);`
`44`	`45`	`#ifndef __OPENCL__`
`45`	`46`	`template <class T, class S>`
`46`	`47`	`GPUh() static void sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp);`
`@@ -224,7 +225,7 @@ namespace o2::gpu`
`224`	`225`	`{`
`225`	`226`
`226`	`227`	`template <class T>`
`227`		`-GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end)`
	`228`	`+GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T begin, T end)`
`228`	`229`	`{`
`229`	`230`	`#ifndef GPUCA_GPUCODE`
`230`	`231`	`GPUCommonAlgorithm::sort(begin, end);`
`@@ -234,7 +235,7 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end)`
`234`	`235`	`}`
`235`	`236`
`236`	`237`	`template <class T, class S>`
`237`		`-GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& comp)`
	`238`	`+GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T begin, T end, const S& comp)`
`238`	`239`	`{`
`239`	`240`	`GPUCommonAlgorithm::sort(begin, end, comp);`
`240`	`241`	`}`
`@@ -248,7 +249,7 @@ namespace o2::gpu`
`248`	`249`	`{`
`249`	`250`
`250`	`251`	`template <class T>`
`251`		`-GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end)`
	`252`	`+GPUdi() void GPUCommonAlgorithm::sort(T begin, T end)`
`252`	`253`	`{`
`253`	`254`	`#ifdef GPUCA_ALGORITHM_STD`
`254`	`255`	`std::sort(begin, end);`
`@@ -258,7 +259,7 @@ GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end)`
`258`	`259`	`}`
`259`	`260`
`260`	`261`	`template <class T, class S>`
`261`		`-GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end, const S& comp)`
	`262`	`+GPUdi() void GPUCommonAlgorithm::sort(T begin, T end, const S& comp)`
`262`	`263`	`{`
`263`	`264`	`#ifdef GPUCA_ALGORITHM_STD`
`264`	`265`	`std::sort(begin, end, comp);`