Skip to content

Commit d07c11a

Browse files
committed
GPU fixup
Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
1 parent bb89f26 commit d07c11a

6 files changed

Lines changed: 104 additions & 269 deletions

File tree

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 7 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ class TimeFrameGPU final : public TimeFrame<NLayers>
4444
/// Most relevant operations
4545
void pushMemoryStack(const int);
4646
void popMemoryStack(const int);
47-
void registerHostMemory(const int);
4847
void unregisterHostMemory(const int);
4948
void initialise(const TrackingParameters&, int maxLayers);
5049
void initialise(const TrackingParameters&, int maxLayers, int iteration);
@@ -64,6 +63,7 @@ class TimeFrameGPU final : public TimeFrame<NLayers>
6463
void loadROFrameClustersDevice(const int);
6564
void createROFrameClustersDeviceArray();
6665
void loadROFCutMask(const int);
66+
void loadTrackingParametersDevice(const TrackingParameters&);
6767
void loadVertices();
6868
void loadROFOverlapTable();
6969
void loadROFVertexLookupTable();
@@ -72,29 +72,18 @@ class TimeFrameGPU final : public TimeFrame<NLayers>
7272
///
7373
void createTrackletsLUTDevice(bool, const int);
7474
void createTrackletsLUTDeviceArray();
75-
void loadTrackletsDevice();
76-
void loadTrackletsLUTDevice();
77-
void loadCellsDevice();
78-
void loadCellsLUTDevice();
79-
void loadTrackSeedsDevice();
80-
void loadTrackSeedsChi2Device();
8175
void loadTrackSeedsDevice(bounded_vector<TrackSeedN>&);
8276
void createTrackletsBuffers(const int);
8377
void createTrackletsBuffersArray();
8478
void createCellsBuffers(const int);
8579
void createCellsBuffersArray();
86-
void createCellsDevice();
8780
void createCellsLUTDevice(const int);
8881
void createCellsLUTDeviceArray();
8982
void createNeighboursIndexTablesDevice(const int);
9083
void createNeighboursDevice(const unsigned int layer);
9184
void createNeighboursLUTDevice(const int, const unsigned int);
9285
void createTrackITSExtDevice(const size_t);
9386
void downloadTrackITSExtDevice();
94-
void downloadCellsNeighboursDevice(std::vector<bounded_vector<CellNeighbour>>&, const int);
95-
void downloadNeighboursLUTDevice(bounded_vector<int>&, const int);
96-
void downloadCellsDevice();
97-
void downloadCellsLUTDevice();
9887

9988
/// synchronization
10089
auto& getStream(const size_t stream) { return mGpuStreams[stream]; }
@@ -116,21 +105,15 @@ class TimeFrameGPU final : public TimeFrame<NLayers>
116105
const auto getDeviceROFVertexLookupTableView() { return mDeviceROFVertexLookupTableView; }
117106
const auto getDeviceROFMaskTableView() { return mDeviceROFMaskTableView; }
118107
const auto getDeviceTrackingTopologyView() const { return mDeviceTrackingTopologyView; }
119-
int* getDeviceROFramesClusters(const int layer) { return mROFramesClustersDevice[layer]; }
120108
auto& getTrackITSExt() { return mTrackITSExt; }
121109
Vertex* getDeviceVertices() { return mPrimaryVerticesDevice; }
122110
int* getDeviceROFramesPV() { return mROFramesPVDevice; }
123-
unsigned char* getDeviceUsedClusters(const int);
124-
const o2::base::Propagator* getChainPropagator();
125111

126112
// Hybrid
127113
TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; }
128114
int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; }
129-
gsl::span<int*> getDeviceNeighboursLUTs() { return mNeighboursLUTDevice; }
130115
CellNeighbour** getDeviceArrayNeighbours() { return mNeighboursDeviceArray; }
131-
std::array<CellNeighbour*, MaxCells>& getDeviceNeighboursAll() { return mNeighboursDevice; }
132116
CellNeighbour* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
133-
TrackingFrameInfo* getDeviceTrackingFrameInfo(const int);
134117
const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
135118
const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
136119
const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; }
@@ -146,8 +129,9 @@ class TimeFrameGPU final : public TimeFrame<NLayers>
146129
TrackSeedN* getDeviceTrackSeeds() { return mTrackSeedsDevice; }
147130
int* getDeviceTrackSeedsLUT() { return mTrackSeedsLUTDevice; }
148131
auto getNTrackSeeds() const { return mNTracks; }
149-
o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; }
150-
float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; }
132+
const float* getDeviceLayerRadii() const { return mLayerRadiiDevice; }
133+
const float* getDeviceMinPts() const { return mMinPtsDevice; }
134+
const float* getDeviceLayerxX0() const { return mLayerxX0Device; }
151135
int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; }
152136

153137
void setDevicePropagator(const o2::base::PropagatorImpl<float>* p) final { this->mPropagatorDevice = p; }
@@ -216,6 +200,9 @@ class TimeFrameGPU final : public TimeFrame<NLayers>
216200
TrackSeedN* mTrackSeedsDevice{nullptr};
217201
int* mTrackSeedsLUTDevice{nullptr};
218202
unsigned int mNTracks{0};
203+
float* mLayerRadiiDevice{nullptr};
204+
float* mMinPtsDevice{nullptr};
205+
float* mLayerxX0Device{nullptr};
219206
std::array<o2::track::TrackParCovF*, MaxCells> mCellSeedsDevice{};
220207
o2::track::TrackParCovF** mCellSeedsDeviceArray;
221208
std::array<float*, MaxCells> mCellSeedsChi2Device{};

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ void countCellsHandler(const Cluster** sortedClusters,
116116
const float maxChi2ClusterAttachment,
117117
const float cellDeltaTanLambdaSigma,
118118
const float nSigmaCut,
119-
const std::vector<float>& layerxX0Host,
119+
const float* layerxX0,
120120
o2::its::ExternalAllocator* alloc,
121121
gpu::Streams& streams);
122122

@@ -136,7 +136,7 @@ void computeCellsHandler(const Cluster** sortedClusters,
136136
const float maxChi2ClusterAttachment,
137137
const float cellDeltaTanLambdaSigma,
138138
const float nSigmaCut,
139-
const std::vector<float>& layerxX0Host,
139+
const float* layerxX0,
140140
gpu::Streams& streams);
141141

142142
template <int NLayers>
@@ -168,12 +168,6 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
168168
const unsigned int nCells,
169169
gpu::Stream& stream);
170170

171-
int filterCellNeighboursHandler(gpuPair<int, int>*,
172-
int*,
173-
unsigned int,
174-
gpu::Stream&,
175-
o2::its::ExternalAllocator* = nullptr);
176-
177171
template <int NLayers>
178172
void processNeighboursHandler(const int startLevel,
179173
const int defaultCellTopologyId,
@@ -192,7 +186,7 @@ void processNeighboursHandler(const int startLevel,
192186
const float maxChi2NDF,
193187
const int maxHoles,
194188
const LayerMask holeLayerMask,
195-
const std::vector<float>& layerxX0Host,
189+
const float* layerxX0,
196190
const o2::base::Propagator* propagator,
197191
const o2::base::PropagatorF::MatCorrType matCorrType,
198192
o2::its::ExternalAllocator* alloc);
@@ -202,9 +196,9 @@ void countTrackSeedHandler(TrackSeed<NLayers>* trackSeeds,
202196
const TrackingFrameInfo** foundTrackingFrameInfo,
203197
const Cluster** unsortedClusters,
204198
int* seedLUT,
205-
const std::vector<float>& layerRadiiHost,
206-
const std::vector<float>& minPtsHost,
207-
const std::vector<float>& layerxX0Host,
199+
const float* layerRadii,
200+
const float* minPts,
201+
const float* layerxX0,
208202
const unsigned int nSeeds,
209203
const float Bz,
210204
const int startLevel,
@@ -223,9 +217,9 @@ void computeTrackSeedHandler(TrackSeed<NLayers>* trackSeeds,
223217
const Cluster** unsortedClusters,
224218
o2::its::TrackITSExt* tracks,
225219
const int* seedLUT,
226-
const std::vector<float>& layerRadiiHost,
227-
const std::vector<float>& minPtsHost,
228-
const std::vector<float>& layerxX0Host,
220+
const float* layerRadii,
221+
const float* minPts,
222+
const float* layerxX0,
229223
const unsigned int nSeeds,
230224
const unsigned int nTracks,
231225
const float Bz,

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
#include <vector>
2020
#include <string>
21-
#include <tuple>
2221

2322
#include "ITStracking/MathUtils.h"
2423
#include "ITStracking/ExternalAllocator.h"
@@ -54,9 +53,6 @@ template <int>
5453
class IndexTableUtils;
5554
class Tracklet;
5655

57-
template <typename T1, typename T2>
58-
using gpuPair = std::pair<T1, T2>;
59-
6056
namespace gpu
6157
{
6258

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 18 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212

1313
#include <cuda_runtime.h>
1414

15-
#include <unistd.h>
1615
#include <vector>
1716

1817
#include "ITStrackingGPU/TimeFrameGPU.h"
@@ -63,6 +62,24 @@ void TimeFrameGPU<NLayers>::loadIndexTableUtils()
6362
GPUChkErrS(cudaMemcpy(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtilsN), cudaMemcpyHostToDevice));
6463
}
6564

65+
template <int NLayers>
66+
void TimeFrameGPU<NLayers>::loadTrackingParametersDevice(const TrackingParameters& trkParam)
67+
{
68+
GPUTimer timer("loading tracking parameters");
69+
if (mLayerRadiiDevice == nullptr) {
70+
allocMem(reinterpret_cast<void**>(&mLayerRadiiDevice), trkParam.LayerRadii.size() * sizeof(float), this->hasFrameworkAllocator());
71+
}
72+
if (mMinPtsDevice == nullptr) {
73+
allocMem(reinterpret_cast<void**>(&mMinPtsDevice), trkParam.MinPt.size() * sizeof(float), this->hasFrameworkAllocator());
74+
}
75+
if (mLayerxX0Device == nullptr) {
76+
allocMem(reinterpret_cast<void**>(&mLayerxX0Device), trkParam.LayerxX0.size() * sizeof(float), this->hasFrameworkAllocator());
77+
}
78+
GPUChkErrS(cudaMemcpy(mLayerRadiiDevice, trkParam.LayerRadii.data(), trkParam.LayerRadii.size() * sizeof(float), cudaMemcpyHostToDevice));
79+
GPUChkErrS(cudaMemcpy(mMinPtsDevice, trkParam.MinPt.data(), trkParam.MinPt.size() * sizeof(float), cudaMemcpyHostToDevice));
80+
GPUChkErrS(cudaMemcpy(mLayerxX0Device, trkParam.LayerxX0.data(), trkParam.LayerxX0.size() * sizeof(float), cudaMemcpyHostToDevice));
81+
}
82+
6683
template <int NLayers>
6784
void TimeFrameGPU<NLayers>::createUnsortedClustersDeviceArray(const int maxLayers)
6885
{
@@ -420,29 +437,6 @@ void TimeFrameGPU<NLayers>::createTrackletsBuffers(const int layer)
420437
GPUChkErrS(cudaMemcpyAsync(&mTrackletsDeviceArray[layer], &mTrackletsDevice[layer], sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
421438
}
422439

423-
template <int NLayers>
424-
void TimeFrameGPU<NLayers>::loadTrackletsDevice()
425-
{
426-
GPUTimer timer(mGpuStreams, "loading tracklets", NLayers - 1);
427-
for (auto iLayer{0}; iLayer < NLayers - 1; ++iLayer) {
428-
GPULog("gpu-transfer: loading {} tracklets on layer {}, for {:.2f} MB.", this->mTracklets[iLayer].size(), iLayer, this->mTracklets[iLayer].size() * sizeof(Tracklet) / constants::MB);
429-
GPUChkErrS(cudaHostRegister(this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable));
430-
GPUChkErrS(cudaMemcpyAsync(mTrackletsDevice[iLayer], this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
431-
}
432-
}
433-
434-
template <int NLayers>
435-
void TimeFrameGPU<NLayers>::loadTrackletsLUTDevice()
436-
{
437-
GPUTimer timer("loading tracklets");
438-
for (auto iLayer{0}; iLayer < NLayers - 2; ++iLayer) {
439-
GPULog("gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {:.2f} MB", this->mTrackletsLookupTable[iLayer].size(), iLayer + 1, this->mTrackletsLookupTable[iLayer].size() * sizeof(int) / constants::MB);
440-
GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
441-
}
442-
mGpuStreams.sync();
443-
GPUChkErrS(cudaMemcpy(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (NLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice));
444-
}
445-
446440
template <int NLayers>
447441
void TimeFrameGPU<NLayers>::createNeighboursIndexTablesDevice(const int layer)
448442
{
@@ -462,19 +456,6 @@ void TimeFrameGPU<NLayers>::createNeighboursLUTDevice(const int layer, const uns
462456
GPUChkErrS(cudaMemcpyAsync(&mNeighboursCellLUTDeviceArray[layer], &mNeighboursLUTDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
463457
}
464458

465-
template <int NLayers>
466-
void TimeFrameGPU<NLayers>::loadCellsDevice()
467-
{
468-
GPUTimer timer(mGpuStreams, "loading cell seeds", NLayers - 2);
469-
for (auto iLayer{0}; iLayer < NLayers - 2; ++iLayer) {
470-
GPULog("gpu-transfer: loading {} cell seeds on layer {}, for {:.2f} MB.", this->mCells[iLayer].size(), iLayer, this->mCells[iLayer].size() * sizeof(CellSeed) / constants::MB);
471-
allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeed), mGpuStreams[iLayer], this->hasFrameworkAllocator());
472-
allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer], this->hasFrameworkAllocator()); // accessory for the neigh. finding.
473-
GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get()));
474-
GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
475-
}
476-
}
477-
478459
template <int NLayers>
479460
void TimeFrameGPU<NLayers>::createCellsLUTDeviceArray()
480461
{
@@ -523,17 +504,6 @@ void TimeFrameGPU<NLayers>::createCellsBuffers(const int layer)
523504
GPUChkErrS(cudaMemcpyAsync(&mCellsDeviceArray[layer], &mCellsDevice[layer], sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
524505
}
525506

526-
template <int NLayers>
527-
void TimeFrameGPU<NLayers>::loadCellsLUTDevice()
528-
{
529-
GPUTimer timer(mGpuStreams, "loading cells LUTs", NLayers - 3);
530-
for (auto iLayer{0}; iLayer < NLayers - 3; ++iLayer) {
531-
GPULog("gpu-transfer: loading cell LUT for {} elements on layer {}, for {:.2f} MB.", this->mCellsLookupTable[iLayer].size(), iLayer, this->mCellsLookupTable[iLayer].size() * sizeof(int) / constants::MB);
532-
GPUChkErrS(cudaHostRegister(this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
533-
GPUChkErrS(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
534-
}
535-
}
536-
537507
template <int NLayers>
538508
void TimeFrameGPU<NLayers>::loadTrackSeedsDevice(bounded_vector<TrackSeedN>& seeds)
539509
{
@@ -581,44 +551,6 @@ void TimeFrameGPU<NLayers>::createTrackITSExtDevice(const size_t nSeeds)
581551
GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, mNTracks * sizeof(o2::its::TrackITSExt)));
582552
}
583553

584-
template <int NLayers>
585-
void TimeFrameGPU<NLayers>::downloadCellsDevice()
586-
{
587-
GPUTimer timer(mGpuStreams, "downloading cells", NLayers - 2);
588-
for (int iLayer{0}; iLayer < NLayers - 2; ++iLayer) {
589-
GPULog("gpu-transfer: downloading {} cells on layer: {}, for {:.2f} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / constants::MB);
590-
this->mCells[iLayer].resize(mNCells[iLayer]);
591-
GPUChkErrS(cudaMemcpyAsync(this->mCells[iLayer].data(), this->mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get()));
592-
}
593-
}
594-
595-
template <int NLayers>
596-
void TimeFrameGPU<NLayers>::downloadCellsLUTDevice()
597-
{
598-
GPUTimer timer(mGpuStreams, "downloading cell luts", NLayers - 3);
599-
for (auto iLayer{0}; iLayer < NLayers - 3; ++iLayer) {
600-
GPULog("gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1));
601-
this->mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1);
602-
GPUChkErrS(cudaMemcpyAsync(this->mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get()));
603-
}
604-
}
605-
606-
template <int NLayers>
607-
void TimeFrameGPU<NLayers>::downloadCellsNeighboursDevice(std::vector<bounded_vector<CellNeighbour>>& neighbours, const int layer)
608-
{
609-
GPUTimer timer(mGpuStreams[layer], "downloading neighbours from layer", layer);
610-
GPULog("gpu-transfer: downloading {} neighbours, for {:.2f} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(CellNeighbour) / constants::MB);
611-
GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(CellNeighbour), cudaMemcpyDeviceToHost, mGpuStreams[layer].get()));
612-
}
613-
614-
template <int NLayers>
615-
void TimeFrameGPU<NLayers>::downloadNeighboursLUTDevice(bounded_vector<int>& lut, const int layer)
616-
{
617-
GPUTimer timer(mGpuStreams[layer], "downloading neighbours LUT from layer", layer);
618-
GPULog("gpu-transfer: downloading neighbours LUT for {} elements on layer {}, for {:.2f} MB.", lut.size(), layer, lut.size() * sizeof(int) / constants::MB);
619-
GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get()));
620-
}
621-
622554
template <int NLayers>
623555
void TimeFrameGPU<NLayers>::downloadTrackITSExtDevice()
624556
{

0 commit comments

Comments
 (0)