// File indexing completed on 2024-04-06 12:26:19
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h

#include <cstdint>
#include <cstdio>

#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelClusterThresholds.h"

namespace gpuClustering {

  // Kernel: apply a per-layer charge cut to the pixel clusters of every
  // module, invalidate the pixels of clusters below threshold, and renumber
  // the surviving clusters of each module compactly (0..nSurviving-1).
  //
  // Launch layout: 1-D grid; each block walks modules in a grid-stride loop
  // (module = blockIdx.x, blockIdx.x + gridDim.x, ...), with all threads of
  // the block cooperating on one module at a time.
  // Shared memory: three static arrays of maxNumClustersPerModules entries
  // (charge accumulator, pass/fail flag, new cluster id) plus a 32-entry
  // scratch buffer for the block prefix scan.
  //
  // Parameters:
  //   clusterThresholds  - charge thresholds; the one used is selected via
  //                        getThresholdForLayerOnCondition(thisModuleId < startBPIX2)
  //   id                 - [in/out] module id of each pixel; set to
  //                        invalidModuleId for pixels of rejected clusters
  //   adc                - charge (ADC) of each pixel
  //   moduleStart        - moduleStart[0] = number of modules to process;
  //                        moduleStart[1 + m] = index of first pixel of module m
  //   nClustersInModule  - [in/out] clusters per module; overwritten with the
  //                        number of clusters surviving the cut
  //   moduleId           - module id expected for each module slot
  //   clusterId          - [in/out] cluster id of each pixel; remapped to the
  //                        compact post-cut numbering
  //   numElements        - total number of pixels (digis)
  template <typename TrackerTraits>
  __global__ void clusterChargeCut(
      SiPixelClusterThresholds clusterThresholds,
      uint16_t* __restrict__ id,
      uint16_t const* __restrict__ adc,
      uint32_t const* __restrict__ moduleStart,
      uint32_t* __restrict__ nClustersInModule,
      uint32_t const* __restrict__ moduleId,
      int32_t* __restrict__ clusterId,
      uint32_t numElements) {
    constexpr int32_t maxNumClustersPerModules = TrackerTraits::maxNumClustersPerModules;

    // Per-module scratch, reused for each module this block processes.
    __shared__ int32_t charge[maxNumClustersPerModules];    // summed ADC per cluster
    __shared__ uint8_t ok[maxNumClustersPerModules];        // 1 = cluster passes the cut
    __shared__ uint16_t newclusId[maxNumClustersPerModules];  // compact renumbering (via prefix scan)

    // First module of the second barrel layer: modules with id below this
    // get the dedicated first-layer threshold (see chargeCut selection below).
    constexpr int startBPIX2 = TrackerTraits::layerStart[1];

    assert(TrackerTraits::numberOfModules < maxNumModules);
    assert(startBPIX2 < TrackerTraits::numberOfModules);

    auto firstModule = blockIdx.x;
    auto endModule = moduleStart[0];
    // Grid-stride loop over modules.
    for (auto module = firstModule; module < endModule; module += gridDim.x) {
      auto firstPixel = moduleStart[1 + module];
      auto thisModuleId = id[firstPixel];
      while (thisModuleId == invalidModuleId and firstPixel < numElements) {
        // skip invalid pixels at the head of this module's range
        ++firstPixel;
        thisModuleId = id[firstPixel];
      }
      if (firstPixel >= numElements) {
        // ran off the end of the pixel array: nothing left for this block
        break;
      }
      if (thisModuleId != moduleId[module]) {
        // module slot does not match its pixels (e.g. all invalid): skip it
        continue;
      }
      assert(thisModuleId < TrackerTraits::numberOfModules);

      auto nclus = nClustersInModule[thisModuleId];
      if (nclus == 0)
        continue;

      if (threadIdx.x == 0 && nclus > maxNumClustersPerModules)
        printf("Warning too many clusters in module %d in block %d: %d > %d\n",
               thisModuleId,
               blockIdx.x,
               nclus,
               maxNumClustersPerModules);

      auto first = firstPixel + threadIdx.x;

      if (nclus > maxNumClustersPerModules) {
        // Overflow: invalidate the pixels of the clusters that do not fit,
        // then clamp the cluster count to the shared-array capacity.
        for (auto i = first; i < numElements; i += blockDim.x) {
          if (id[i] == invalidModuleId)
            continue;  // not a valid pixel
          if (id[i] != thisModuleId)
            break;  // end of this module's pixels
          if (clusterId[i] >= maxNumClustersPerModules) {
            id[i] = invalidModuleId;
            clusterId[i] = invalidModuleId;
          }
        }
        nclus = maxNumClustersPerModules;
      }

#ifdef GPU_DEBUG
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("start cluster charge cut for module %d in block %d\n", thisModuleId, blockIdx.x);
#endif

      assert(nclus <= maxNumClustersPerModules);
      // zero the per-cluster charge accumulators
      for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
        charge[i] = 0;
      }
      __syncthreads();

      // accumulate the total charge of each cluster of this module
      for (auto i = first; i < numElements; i += blockDim.x) {
        if (id[i] == invalidModuleId)
          continue;  // not a valid pixel
        if (id[i] != thisModuleId)
          break;  // end of this module's pixels
        atomicAdd(&charge[clusterId[i]], adc[i]);
      }
      __syncthreads();

      // threshold for this module: first-layer value vs the default one
      auto chargeCut = clusterThresholds.getThresholdForLayerOnCondition(thisModuleId < startBPIX2);

      // flag each cluster; newclusId doubles as the scan input (0/1)
      bool good = true;
      for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
        newclusId[i] = ok[i] = charge[i] >= chargeCut ? 1 : 0;
        if (0 == ok[i])
          good = false;
      }

      // If every cluster passes, no renumbering is needed.
      // __syncthreads_and is a block-wide collective, so every thread of the
      // block reaches it and they all take the same branch afterwards.
      if (__syncthreads_and(good))
        continue;

      // Renumber survivors with a block prefix scan over the 0/1 flags.
      __shared__ uint16_t ws[32];
      constexpr auto maxThreads = 1024;
      auto minClust = nclus > maxThreads ? maxThreads : nclus;

      cms::cuda::blockPrefixScan(newclusId, newclusId, minClust, ws);
      // Only compiled in when a single scan cannot cover all clusters.
      if constexpr (maxNumClustersPerModules > maxThreads)
      {
        for (uint32_t offset = maxThreads; offset < nclus; offset += maxThreads) {
          cms::cuda::blockPrefixScan(newclusId + offset, newclusId + offset, nclus - offset, ws);
          // propagate the running total of the previous chunk into this chunk
          for (uint32_t i = threadIdx.x + offset; i < nclus; i += blockDim.x) {
            uint32_t prevBlockEnd = ((i / maxThreads) * maxThreads) - 1;
            newclusId[i] += newclusId[prevBlockEnd];
          }
          __syncthreads();
        }
      }
      // at least one cluster failed, so the surviving count is strictly smaller
      assert(nclus > newclusId[nclus - 1]);

      nClustersInModule[thisModuleId] = newclusId[nclus - 1];

      // Invalidate pixels of rejected clusters; remap the rest to the compact
      // numbering (the scanned value is a 1-based count, hence the -1).
      for (auto i = first; i < numElements; i += blockDim.x) {
        if (id[i] == invalidModuleId)
          continue;  // not a valid pixel
        if (id[i] != thisModuleId)
          break;  // end of this module's pixels
        if (0 == ok[clusterId[i]])
          clusterId[i] = id[i] = invalidModuleId;
        else
          clusterId[i] = newclusId[clusterId[i]] - 1;
      }

      // make sure all threads are done with the shared arrays before this
      // block moves on to its next module and overwrites them
      __syncthreads();
    }
  }

}  // namespace gpuClustering

#endif  // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h