99// granted to it by virtue of its status as an Intergovernmental Organization
1010// or submit itself to any jurisdiction.
1111
12- // / \file GPUTPCCFCheckPadBaseline.h
12+ // / \file GPUTPCCFCheckPadBaseline.cxx
1313// / \author Felix Weiglhofer
1414
1515#include " GPUTPCCFCheckPadBaseline.h"
@@ -28,51 +28,82 @@ using namespace o2::gpu::tpccf;
// Kernel entry point (Thread<0> specialization).
// Selects the implementation at compile time and forwards all arguments unchanged:
// CheckBaselineGPU when GPUCA_GPUCODE is defined, CheckBaselineCPU otherwise.
2828template <>
2929GPUd () void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
3030{
31+ #ifdef GPUCA_GPUCODE
32+ CheckBaselineGPU (nBlocks, nThreads, iBlock, iThread, smem, clusterer);
33+ #else
34+ CheckBaselineCPU (nBlocks, nThreads, iBlock, iThread, smem, clusterer);
35+ #endif
36+ }
37+
// Pad-baseline check, GPU variant.
// The whole body is guarded by GPUCA_GPUCODE, so the function is empty when that macro is not defined.
// Per block: stages charges of NumOfCachedPads pads into smem.charges one time slice at a time,
// lets one thread per pad accumulate statistics over the staged time bins, and finally passes
// those statistics to updatePadBaseline.
// nBlocks and nThreads are not used in the visible body.
38+ GPUd () void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
39+ {
40+ #ifdef GPUCA_GPUCODE
41+
3142 const CfFragment& fragment = clusterer.mPmemory ->fragment ;
// View the clusterer's charge map buffer as a 2D array of PackedCharge.
3243 CfArray2D<PackedCharge> chargeMap (reinterpret_cast <PackedCharge*>(clusterer.mPchargeMap ));
3344
34- int32_t basePad = iBlock * PadsPerCacheline;
35- CfChargePos basePos = padToCfChargePos (basePad, clusterer);
// First pad index handled by this block; padToCfChargePos takes basePad by reference and may lower it.
45+ int32_t basePad = iBlock * NumOfCachedPads;
// Written by padToCfChargePos: number of pads in the row that contains basePad (0 if none found).
46+ int32_t padsPerRow;
47+ CfChargePos basePos = padToCfChargePos<NumOfCachedPads>(basePad, clusterer, padsPerRow);
3648
// No row contains this pad index: nothing to do. basePos depends only on iBlock, so every
// thread calling with the same iBlock takes this exit together, before any GPUbarrier.
3749 if (not basePos.valid ()) {
3850 return ;
3951 }
4052
41- #ifdef GPUCA_GPUCODE
42- static_assert (TPC_MAX_FRAGMENT_LEN_GPU % NumOfCachedTimebins == 0 );
43-
// Per-pad statistics, only meaningful in threads with handlePad == true:
// totalCharges     - number of staged samples with q > 0
// consecCharges    - length of the current run of samples with q > 0
// maxConsecCharges - longest such run seen so far
// maxCharge        - largest sample seen so far
4453 int32_t totalCharges = 0 ;
4554 int32_t consecCharges = 0 ;
4655 int32_t maxConsecCharges = 0 ;
4756 Charge maxCharge = 0 ;
4857
49- int16_t localPadId = iThread / NumOfCachedTimebins;
50- int16_t localTimeBin = iThread % NumOfCachedTimebins;
58+ int16_t iCacheline = iThread / EntriesPerCacheline; // Index of local cacheline
59+ int16_t iCLEntry = iThread % EntriesPerCacheline; // Index within cacheline
60+
// Within a cacheline, entry index modulo PadsPerCacheline selects the pad and the quotient selects the time bin.
// NOTE(review): presumably mirrors the memory layout of the charge map tiles - confirm against CfArray2D / TilingLayout.
61+ int16_t localPadId = iCacheline * PadsPerCacheline + iCLEntry % PadsPerCacheline;
62+ int16_t localTimeBin = iCLEntry / PadsPerCacheline;
// Exactly the threads mapped to time bin 0 of a pad accumulate that pad's statistics.
5163 bool handlePad = localTimeBin == 0 ;
5264
53- for (tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin (); t < fragment.lastNonOverlapTimeBin (); t += NumOfCachedTimebins) {
54- const CfChargePos pos = basePos.delta ({localPadId, int16_t (t + localTimeBin)});
55- smem.charges [localPadId][localTimeBin] = (pos.valid ()) ? chargeMap[pos].unpack () : 0 ;
// Walk the fragment's non-overlap time range in steps of TimebinsPerCacheline.
65+ for (tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin (); t < fragment.lastNonOverlapTimeBin (); t += TimebinsPerCacheline) {
// NOTE(review): presumably the position of the block's first pad at time t - delta() is defined outside this view.
66+ const CfChargePos pos = basePos.delta ({0 , t});
67+
68+ auto * charges = &chargeMap[pos];
69+
// Each thread reads the element iThread entries after chargeMap[pos] and stores it at the
// (pad, time bin) slot computed above. No validity check is applied to this read.
// NOTE(review): assumes the charge map is addressable for all iThread offsets from pos - confirm padding/layout.
70+ smem.charges [localPadId][localTimeBin] = charges[iThread].unpack ();
71+
// NOTE(review): presumably a block-wide barrier so that all stores to smem.charges complete before they are read below.
5672 GPUbarrier ();
73+
5774 if (handlePad) {
58- for (int32_t i = 0 ; i < NumOfCachedTimebins ; i++) {
// Fold the TimebinsPerCacheline staged samples of this pad into the running statistics.
75+ for (int32_t i = 0 ; i < TimebinsPerCacheline ; i++) {
5976 const Charge q = smem.charges [localPadId][i];
6077 totalCharges += (q > 0 );
6178 consecCharges = (q > 0 ) ? consecCharges + 1 : 0 ;
6279 maxConsecCharges = CAMath::Max (consecCharges, maxConsecCharges);
6380 maxCharge = CAMath::Max<Charge>(q, maxCharge);
6481 }
6582 }
83+
// NOTE(review): presumably keeps the next iteration's stores from overwriting smem.charges while it is still being read.
6684 GPUbarrier ();
6785 }
6886
69- GPUbarrier ();
70-
71- if (handlePad) {
// Report only pads that lie inside the row; slots past the row's last pad are dropped.
87+ if (handlePad && basePos.pad () + localPadId < padsPerRow) {
7288 updatePadBaseline (basePad + localPadId, clusterer, totalCharges, maxConsecCharges, maxCharge);
7389 }
7490
75- #else // CPU CODE
91+ #endif
92+ }
93+
94+ GPUd () void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
95+ {
96+ #ifndef GPUCA_GPUCODE
97+ const CfFragment& fragment = clusterer.mPmemory ->fragment ;
98+ CfArray2D<PackedCharge> chargeMap (reinterpret_cast <PackedCharge*>(clusterer.mPchargeMap ));
99+
100+ int32_t basePad = iBlock * PadsPerCacheline;
101+ int32_t padsPerRow;
102+ CfChargePos basePos = padToCfChargePos<PadsPerCacheline>(basePad, clusterer, padsPerRow);
103+
104+ if (not basePos.valid ()) {
105+ return ;
106+ }
76107
77108 constexpr size_t ElemsInTileRow = (size_t )TilingLayout<GridSize<2 >>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
78109
@@ -122,22 +153,25 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
122153#endif
123154}
124155
125- GPUd () CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t & pad, const GPUTPCClusterFinder& clusterer)
// Translates a pad index counted continuously across rows (row 0's pads first, then row 1's, ...)
// into the charge-map position of the first pad of its PadsPerBlock-sized group within the row.
//   PadsPerBlock - group size used to round the in-row pad index down
//   pad          - in/out: reduced by the pad's offset inside its group when a row is found
//   clusterer    - not used in the visible body
//   padsPerRow   - out: pad count of the matching row, or 0 when no row matches
// Returns CfChargePos{row, first pad of the group, 0} on success, or
// CfChargePos{0, 0, INVALID_TIME_BIN} when pad lies outside every row.
156+ template <int32_t PadsPerBlock>
157+ GPUd () CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t & pad, const GPUTPCClusterFinder& clusterer, int32_t & padsPerRow)
126158{
127159 constexpr GPUTPCGeometry geo;
128160
// Running sum of the pad counts of all rows before r.
129161 int32_t padOffset = 0 ;
130162 for (Row r = 0 ; r < GPUCA_ROW_COUNT ; r++) {
131163 int32_t npads = geo.NPads (r);
132164 int32_t padInRow = pad - padOffset;
133- if (0 <= padInRow && padInRow < CAMath::nextMultipleOf<PadsPerCacheline, int32_t >( npads) ) {
134- int32_t cachelineOffset = padInRow % PadsPerCacheline ;
// pad falls inside row r exactly when its row-relative index is in [0, npads).
165+ if (0 <= padInRow && padInRow < npads) {
// Offset of the pad inside its group; groups are aligned to the start of the row.
166+ int32_t cachelineOffset = padInRow % PadsPerBlock ;
135167 pad -= cachelineOffset;
168+ padsPerRow = npads;
136169 return CfChargePos{r, Pad (padInRow - cachelineOffset), 0 };
137170 }
138171 padOffset += npads;
139172 }
140173
// pad is not covered by any row: signal failure through both the out-parameter and the returned position.
174+ padsPerRow = 0 ;
141175 return CfChargePos{0 , 0 , INVALID_TIME_BIN };
142176}
143177
0 commit comments