Refactor Pad Filter.

fweig · fweig · commit dc58aef1d62e · 2026-01-22T16:36:03.000+01:00
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -962,7 +962,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter;
 
         if (checkForNoisyPads) {
-          int32_t nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::PadsPerCacheline;
+          const int32_t padsPerBlock = doGPU ? 16 : 8; // FIXME: Don't hardcode this!!! Must match the pads handled per block by GPUTPCCFCheckPadBaseline: NumOfCachedPads (kernel thread count / TimebinsPerCacheline) on GPU, PadsPerCacheline (8) on CPU.
+          const int32_t nBlocks = TPC_PADS_IN_SECTOR / padsPerBlock; // Integer division; nBlocks sizes the kernel grid passed to GetGridBlk below.
 
           runKernel<GPUTPCCFCheckPadBaseline>({GetGridBlk(nBlocks, lane), {iSector}});
           getKernelTimer<GPUTPCCFCheckPadBaseline>(RecoStep::TPCClusterFinding, iSector, TPC_PADS_IN_SECTOR * fragment.lengthWithoutOverlap() * sizeof(PackedCharge), false);
diff --git a/GPU/GPUTracking/TPCClusterFinder/CfArray2D.h b/GPU/GPUTracking/TPCClusterFinder/CfArray2D.h
@@ -116,6 +116,7 @@ using TPCMapMemoryLayout = TilingLayout<GridSize<sizeof(T)>>;
 #else
 template <typename T>
 using TPCMapMemoryLayout = LinearLayout;
+#error "The frick u doing"
 #endif
 
 template <typename T>
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
@@ -9,7 +9,7 @@
 // granted to it by virtue of its status as an Intergovernmental Organization
 // or submit itself to any jurisdiction.
 
-/// \file GPUTPCCFCheckPadBaseline.h
+/// \file GPUTPCCFCheckPadBaseline.cxx
 /// \author Felix Weiglhofer
 
 #include "GPUTPCCFCheckPadBaseline.h"
@@ -28,51 +28,82 @@ using namespace o2::gpu::tpccf;
 template <>
 GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
 {
+#ifdef GPUCA_GPUCODE
+  CheckBaselineGPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer); // GPUCA_GPUCODE build: variant that stages charges in smem
+#else
+  CheckBaselineCPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer); // otherwise: CPU variant
+#endif
+}
+
+GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
+{
+#ifdef GPUCA_GPUCODE
+  // Per block: stage NumOfCachedPads pads x TimebinsPerCacheline time bins in smem.charges, let the localTimeBin == 0 thread of each pad accumulate that pad's statistics, then report them via updatePadBaseline.
   const CfFragment& fragment = clusterer.mPmemory->fragment;
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
 
-  int32_t basePad = iBlock * PadsPerCacheline;
-  CfChargePos basePos = padToCfChargePos(basePad, clusterer);
+  int32_t basePad = iBlock * NumOfCachedPads; // first pad index of this block; padToCfChargePos may move it back to a block-aligned pad
+  int32_t padsPerRow; // filled by padToCfChargePos with the pad count of the matched row (0 if none)
+  CfChargePos basePos = padToCfChargePos<NumOfCachedPads>(basePad, clusterer, padsPerRow);
 
   if (not basePos.valid()) {
     return;
   }
 
-#ifdef GPUCA_GPUCODE
-  static_assert(TPC_MAX_FRAGMENT_LEN_GPU % NumOfCachedTimebins == 0);
-
   int32_t totalCharges = 0;
   int32_t consecCharges = 0;
   int32_t maxConsecCharges = 0;
   Charge maxCharge = 0;
 
-  int16_t localPadId = iThread / NumOfCachedTimebins;
-  int16_t localTimeBin = iThread % NumOfCachedTimebins;
+  int16_t iCacheline = iThread / EntriesPerCacheline; // Index of local cacheline
+  int16_t iCLEntry = iThread % EntriesPerCacheline;   // Index within cacheline
+  // Within a cacheline, consecutive entries step through the pads first, then through the time bins.
+  int16_t localPadId = iCacheline * PadsPerCacheline + iCLEntry % PadsPerCacheline;
+  int16_t localTimeBin = iCLEntry / PadsPerCacheline;
   bool handlePad = localTimeBin == 0;
 
-  for (tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += NumOfCachedTimebins) {
-    const CfChargePos pos = basePos.delta({localPadId, int16_t(t + localTimeBin)});
-    smem.charges[localPadId][localTimeBin] = (pos.valid()) ? chargeMap[pos].unpack() : 0;
+  for (tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
+    const CfChargePos pos = basePos.delta({0, t});
+    // Address of the map entry at pos; iThread indexes from there, so the block's entries are assumed to be contiguous in thread order. NOTE(review): confirm against the charge-map layout.
+    auto* charges = &chargeMap[pos];
+    // NOTE(review): the load is unconditional (no pos.valid() check) -- confirm that entries past the row end are readable.
+    smem.charges[localPadId][localTimeBin] = charges[iThread].unpack();
+
     GPUbarrier();
+    // Only the localTimeBin == 0 thread of each pad scans that pad's cached time bins.
     if (handlePad) {
-      for (int32_t i = 0; i < NumOfCachedTimebins; i++) {
+      for (int32_t i = 0; i < TimebinsPerCacheline; i++) {
         const Charge q = smem.charges[localPadId][i];
         totalCharges += (q > 0);
         consecCharges = (q > 0) ? consecCharges + 1 : 0;
         maxConsecCharges = CAMath::Max(consecCharges, maxConsecCharges);
         maxCharge = CAMath::Max<Charge>(q, maxCharge);
       }
     }
+    // GPUbarrier() again before the next iteration overwrites smem.charges.
     GPUbarrier();
   }
 
-  GPUbarrier();
-
-  if (handlePad) {
+  if (handlePad && basePos.pad() + localPadId < padsPerRow) { // skip pads beyond the end of the row
     updatePadBaseline(basePad + localPadId, clusterer, totalCharges, maxConsecCharges, maxCharge);
   }
 
-#else // CPU CODE
+#endif
+}
+
+GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
+{
+#ifndef GPUCA_GPUCODE
+  const CfFragment& fragment = clusterer.mPmemory->fragment;
+  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+
+  int32_t basePad = iBlock * PadsPerCacheline;
+  int32_t padsPerRow;
+  CfChargePos basePos = padToCfChargePos<PadsPerCacheline>(basePad, clusterer, padsPerRow);
+
+  if (not basePos.valid()) {
+    return;
+  }
 
   constexpr size_t ElemsInTileRow = (size_t)TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
 
@@ -122,22 +153,25 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
 #endif
 }
 
-GPUd() CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer)
+template <int32_t PadsPerBlock> // PadsPerBlock: pads covered by one block; the returned pad is aligned down to a multiple of it within the row
+GPUd() CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer, int32_t& padsPerRow)
 {
   constexpr GPUTPCGeometry geo;
 
   int32_t padOffset = 0;
   for (Row r = 0; r < GPUCA_ROW_COUNT; r++) {
     int32_t npads = geo.NPads(r);
     int32_t padInRow = pad - padOffset;
-    if (0 <= padInRow && padInRow < CAMath::nextMultipleOf<PadsPerCacheline, int32_t>(npads)) {
-      int32_t cachelineOffset = padInRow % PadsPerCacheline;
+    if (0 <= padInRow && padInRow < npads) { // pad falls into row r
+      int32_t cachelineOffset = padInRow % PadsPerBlock; // distance back to the block-aligned pad. NOTE(review): rows are packed by npads, so a row whose pad count is not a multiple of PadsPerBlock may get fewer blocks than needed to reach its last pads -- verify coverage.
       pad -= cachelineOffset;
+      padsPerRow = npads; // out: pad count of the matched row
       return CfChargePos{r, Pad(padInRow - cachelineOffset), 0};
     }
     padOffset += npads;
   }
 
+  padsPerRow = 0; // out: no row matched; the position returned below is built with INVALID_TIME_BIN
   return CfChargePos{0, 0, INVALID_TIME_BIN};
 }
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h
@@ -30,11 +30,12 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
   enum {
     PadsPerCacheline = 8,
     TimebinsPerCacheline = 4,
-    NumOfCachedTimebins = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) / PadsPerCacheline,
+    EntriesPerCacheline = PadsPerCacheline * TimebinsPerCacheline, // 8 * 4 = 32 charge entries per cacheline
+    NumOfCachedPads = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) / TimebinsPerCacheline, // first dimension of GPUSharedMemory::charges: pads cached per block (presumably the kernel's thread count / 4)
   };
 
   struct GPUSharedMemory {
-    tpccf::Charge charges[PadsPerCacheline][NumOfCachedTimebins];
+    tpccf::Charge charges[NumOfCachedPads][TimebinsPerCacheline]; // indexed as [pad within block][time bin within cacheline]
   };
 
   typedef GPUTPCClusterFinder processorType;
@@ -52,7 +53,11 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
   GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer);
 
  private:
-  GPUd() static CfChargePos padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder&);
+  GPUd() static void CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer); // body is compiled only when GPUCA_GPUCODE is defined
+  GPUd() static void CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer); // body is compiled only when GPUCA_GPUCODE is not defined
+  // Maps pad to the position of its row, aligned down to a multiple of PadsPerBlock; moves pad back by the same amount and writes the row's pad count (0 if no row matches) to padsPerRow.
+  template <int32_t PadsPerBlock>
+  GPUd() static CfChargePos padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder&, int32_t& padsPerRow);
   GPUd() static void updatePadBaseline(int32_t pad, const GPUTPCClusterFinder&, int32_t totalCharges, int32_t consecCharges, tpccf::Charge maxCharge);
 };