Skip to content

Commit dc58aef

Browse files
committed
Refactor Pad Filter.
1 parent e5c677c commit dc58aef

4 files changed

Lines changed: 64 additions & 23 deletions

File tree

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -962,7 +962,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
962962
checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter;
963963

964964
if (checkForNoisyPads) {
965-
int32_t nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::PadsPerCacheline;
965+
const int32_t padsPerBlock = doGPU ? 16 : 8; // FIXME: Don't hardcode this!!!
966+
const int32_t nBlocks = TPC_PADS_IN_SECTOR / padsPerBlock;
966967

967968
runKernel<GPUTPCCFCheckPadBaseline>({GetGridBlk(nBlocks, lane), {iSector}});
968969
getKernelTimer<GPUTPCCFCheckPadBaseline>(RecoStep::TPCClusterFinding, iSector, TPC_PADS_IN_SECTOR * fragment.lengthWithoutOverlap() * sizeof(PackedCharge), false);

GPU/GPUTracking/TPCClusterFinder/CfArray2D.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ using TPCMapMemoryLayout = TilingLayout<GridSize<sizeof(T)>>;
116116
#else
117117
template <typename T>
118118
using TPCMapMemoryLayout = LinearLayout;
119+
#error "The frick u doing"
119120
#endif
120121

121122
template <typename T>

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx

Lines changed: 53 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
// granted to it by virtue of its status as an Intergovernmental Organization
1010
// or submit itself to any jurisdiction.
1111

12-
/// \file GPUTPCCFCheckPadBaseline.h
12+
/// \file GPUTPCCFCheckPadBaseline.cxx
1313
/// \author Felix Weiglhofer
1414

1515
#include "GPUTPCCFCheckPadBaseline.h"
@@ -28,51 +28,82 @@ using namespace o2::gpu::tpccf;
2828
template <>
2929
GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
3030
{
31+
#ifdef GPUCA_GPUCODE
32+
CheckBaselineGPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
33+
#else
34+
CheckBaselineCPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
35+
#endif
36+
}
37+
38+
GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
39+
{
40+
#ifdef GPUCA_GPUCODE
41+
3142
const CfFragment& fragment = clusterer.mPmemory->fragment;
3243
CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
3344

34-
int32_t basePad = iBlock * PadsPerCacheline;
35-
CfChargePos basePos = padToCfChargePos(basePad, clusterer);
45+
int32_t basePad = iBlock * NumOfCachedPads;
46+
int32_t padsPerRow;
47+
CfChargePos basePos = padToCfChargePos<NumOfCachedPads>(basePad, clusterer, padsPerRow);
3648

3749
if (not basePos.valid()) {
3850
return;
3951
}
4052

41-
#ifdef GPUCA_GPUCODE
42-
static_assert(TPC_MAX_FRAGMENT_LEN_GPU % NumOfCachedTimebins == 0);
43-
4453
int32_t totalCharges = 0;
4554
int32_t consecCharges = 0;
4655
int32_t maxConsecCharges = 0;
4756
Charge maxCharge = 0;
4857

49-
int16_t localPadId = iThread / NumOfCachedTimebins;
50-
int16_t localTimeBin = iThread % NumOfCachedTimebins;
58+
int16_t iCacheline = iThread / EntriesPerCacheline; // Index of local cacheline
59+
int16_t iCLEntry = iThread % EntriesPerCacheline; // Index within cacheline
60+
61+
int16_t localPadId = iCacheline * PadsPerCacheline + iCLEntry % PadsPerCacheline;
62+
int16_t localTimeBin = iCLEntry / PadsPerCacheline;
5163
bool handlePad = localTimeBin == 0;
5264

53-
for (tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += NumOfCachedTimebins) {
54-
const CfChargePos pos = basePos.delta({localPadId, int16_t(t + localTimeBin)});
55-
smem.charges[localPadId][localTimeBin] = (pos.valid()) ? chargeMap[pos].unpack() : 0;
65+
for (tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
66+
const CfChargePos pos = basePos.delta({0, t});
67+
68+
auto* charges = &chargeMap[pos];
69+
70+
smem.charges[localPadId][localTimeBin] = charges[iThread].unpack();
71+
5672
GPUbarrier();
73+
5774
if (handlePad) {
58-
for (int32_t i = 0; i < NumOfCachedTimebins; i++) {
75+
for (int32_t i = 0; i < TimebinsPerCacheline; i++) {
5976
const Charge q = smem.charges[localPadId][i];
6077
totalCharges += (q > 0);
6178
consecCharges = (q > 0) ? consecCharges + 1 : 0;
6279
maxConsecCharges = CAMath::Max(consecCharges, maxConsecCharges);
6380
maxCharge = CAMath::Max<Charge>(q, maxCharge);
6481
}
6582
}
83+
6684
GPUbarrier();
6785
}
6886

69-
GPUbarrier();
70-
71-
if (handlePad) {
87+
if (handlePad && basePos.pad() + localPadId < padsPerRow) {
7288
updatePadBaseline(basePad + localPadId, clusterer, totalCharges, maxConsecCharges, maxCharge);
7389
}
7490

75-
#else // CPU CODE
91+
#endif
92+
}
93+
94+
GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
95+
{
96+
#ifndef GPUCA_GPUCODE
97+
const CfFragment& fragment = clusterer.mPmemory->fragment;
98+
CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
99+
100+
int32_t basePad = iBlock * PadsPerCacheline;
101+
int32_t padsPerRow;
102+
CfChargePos basePos = padToCfChargePos<PadsPerCacheline>(basePad, clusterer, padsPerRow);
103+
104+
if (not basePos.valid()) {
105+
return;
106+
}
76107

77108
constexpr size_t ElemsInTileRow = (size_t)TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
78109

@@ -122,22 +153,25 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
122153
#endif
123154
}
124155

125-
GPUd() CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer)
156+
template <int32_t PadsPerBlock>
157+
GPUd() CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer, int32_t& padsPerRow)
126158
{
127159
constexpr GPUTPCGeometry geo;
128160

129161
int32_t padOffset = 0;
130162
for (Row r = 0; r < GPUCA_ROW_COUNT; r++) {
131163
int32_t npads = geo.NPads(r);
132164
int32_t padInRow = pad - padOffset;
133-
if (0 <= padInRow && padInRow < CAMath::nextMultipleOf<PadsPerCacheline, int32_t>(npads)) {
134-
int32_t cachelineOffset = padInRow % PadsPerCacheline;
165+
if (0 <= padInRow && padInRow < npads) {
166+
int32_t cachelineOffset = padInRow % PadsPerBlock;
135167
pad -= cachelineOffset;
168+
padsPerRow = npads;
136169
return CfChargePos{r, Pad(padInRow - cachelineOffset), 0};
137170
}
138171
padOffset += npads;
139172
}
140173

174+
padsPerRow = 0;
141175
return CfChargePos{0, 0, INVALID_TIME_BIN};
142176
}
143177

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,12 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
3030
enum {
3131
PadsPerCacheline = 8,
3232
TimebinsPerCacheline = 4,
33-
NumOfCachedTimebins = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) / PadsPerCacheline,
33+
EntriesPerCacheline = PadsPerCacheline * TimebinsPerCacheline,
34+
NumOfCachedPads = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) / TimebinsPerCacheline,
3435
};
3536

3637
struct GPUSharedMemory {
37-
tpccf::Charge charges[PadsPerCacheline][NumOfCachedTimebins];
38+
tpccf::Charge charges[NumOfCachedPads][TimebinsPerCacheline];
3839
};
3940

4041
typedef GPUTPCClusterFinder processorType;
@@ -52,7 +53,11 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
5253
GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer);
5354

5455
private:
55-
GPUd() static CfChargePos padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder&);
56+
GPUd() static void CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer);
57+
GPUd() static void CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer);
58+
59+
template <int32_t PadsPerBlock>
60+
GPUd() static CfChargePos padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder&, int32_t& padsPerRow);
5661
GPUd() static void updatePadBaseline(int32_t pad, const GPUTPCClusterFinder&, int32_t totalCharges, int32_t consecCharges, tpccf::Charge maxCharge);
5762
};
5863

0 commit comments

Comments
 (0)