#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

#include <cstdint>
#include <cstdio>

#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

namespace gpuClustering {

  // dimensions of a Phase-1 pixel module, used below to size the per-pixel status words
  constexpr uint32_t pixelSizeX = 160;
  constexpr uint32_t pixelSizeY = 416;

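  // pixelStatus packs a 2-bit per-pixel state (kEmpty, kFound, kDuplicate) into 32-bit words,
  // 16 pixels per word, so that duplicate pixels within a module can be flagged using a single
  // shared-memory array of pixelStatus::size words.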
  namespace pixelStatus {

    enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 };

    constexpr uint32_t bits = 2;
    constexpr uint32_t mask = (0x01 << bits) - 1;
    constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits;
    constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord;

    __device__ static constexpr inline uint32_t getIndex(uint16_t x, uint16_t y) {
      return (pixelSizeX * y + x) / valuesPerWord;
    }

    __device__ constexpr inline uint32_t getShift(uint16_t x, uint16_t y) { return (x % valuesPerWord) * 2; }
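    // Worked example (valuesPerWord == 16): pixel (x = 5, y = 3) is tracked in word
    // getIndex(5, 3) = (160 * 3 + 5) / 16 = 30, at bit offset getShift(5, 3) = (5 % 16) * 2 = 10.
    // Since pixelSizeX is a multiple of valuesPerWord, pixels sharing a word always share a row,
    // so a shift based only on x is consistent with the word index above.
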
    __device__ constexpr inline Status getStatus(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) {
      uint32_t index = getIndex(x, y);
      uint32_t shift = getShift(x, y);
      return Status{(status[index] >> shift) & mask};
    }

    __device__ constexpr inline bool isDuplicate(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) {
      return getStatus(status, x, y) == kDuplicate;
    }

    // promote the status of a pixel: kEmpty -> kFound -> kDuplicate, using an atomic CAS loop
    // so that concurrent updates from different threads are never lost
    __device__ constexpr inline void promote(uint32_t* __restrict__ status, const uint16_t x, const uint16_t y) {
      uint32_t index = getIndex(x, y);
      uint32_t shift = getShift(x, y);
      uint32_t old_word = status[index];
      uint32_t expected = old_word;
      do {
        expected = old_word;
        Status old_status{(old_word >> shift) & mask};
        if (kDuplicate == old_status) {
          // nothing to do
          return;
        }
        Status new_status = (kEmpty == old_status) ? kFound : kDuplicate;
        uint32_t new_word = old_word | (static_cast<uint32_t>(new_status) << shift);
        old_word = atomicCAS(&status[index], expected, new_word);
      } while (expected != old_word);
    }

  }  // namespace pixelStatus

#ifdef GPU_DEBUG
  __device__ uint32_t gMaxHit = 0;
#endif

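  // Scan all digis and, for every run of pixels belonging to the same module, store in
  // moduleStart the index of the module's first pixel; moduleStart[0] counts the modules found.
  // clusterId is initialised to each pixel's own index.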
  template <typename TrackerTraits>
  __global__ void countModules(uint16_t const* __restrict__ id,
                               uint32_t* __restrict__ moduleStart,
                               int32_t* __restrict__ clusterId,
                               int numElements) {
    int first = blockDim.x * blockIdx.x + threadIdx.x;

    [[maybe_unused]] constexpr int nMaxModules = TrackerTraits::numberOfModules;

    assert(nMaxModules < maxNumModules);
    for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
      clusterId[i] = i;
      if (invalidModuleId == id[i])
        continue;
      auto j = i - 1;
      while (j >= 0 and id[j] == invalidModuleId)
        --j;
      if (j < 0 or id[j] != id[i]) {
        // module boundary: count the module and store the index of its first pixel
        auto loc = atomicInc(moduleStart, nMaxModules);
        moduleStart[loc + 1] = i;
      }
    }
  }

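  // Cluster the pixels of each module (one CUDA block per module):
  //  - Phase-1 only: drop duplicate pixels using the pixelStatus bookkeeping above;
  //  - fill a histogram of the module's pixels, binned by their y coordinate;
  //  - for every pixel, collect its neighbours (|dx| <= 1, dy in {0, 1});
  //  - iteratively propagate cluster ids so that every pixel of a cluster ends up carrying the
  //    smallest pixel index in that cluster;
  //  - renumber the clusters 0..foundClusters-1 and store the per-module results.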
  template <typename TrackerTraits>
  __global__ void findClus(uint32_t* __restrict__ rawIdArr,
                           uint16_t* __restrict__ id,                 // module id of each pixel
                           uint16_t const* __restrict__ x,            // local coordinates of each pixel
                           uint16_t const* __restrict__ y,            //
                           uint32_t const* __restrict__ moduleStart,  // index of the first pixel of each module
                           uint32_t* __restrict__ nClustersInModule,  // output: number of clusters found in each module
                           uint32_t* __restrict__ moduleId,           // output: module id of each module
                           int32_t* __restrict__ clusterId,           // output: cluster id of each pixel
                           int numElements) {
    // the pixelStatus array is only used for Phase-1; for Phase-2 it is kept at a token size
    // so that it does not affect the shared-memory footprint of the kernel
    constexpr bool isPhase2 = std::is_base_of<pixelTopology::Phase2, TrackerTraits>::value;
    constexpr const uint32_t pixelStatusSize = isPhase2 ? 1 : pixelStatus::size;
    __shared__ uint32_t status[pixelStatusSize];  // packed words holding the pixelStatus of each pixel
    __shared__ int msize;

    auto firstModule = blockIdx.x;
    auto endModule = moduleStart[0];

    assert(TrackerTraits::numberOfModules < maxNumModules);

    for (auto module = firstModule; module < endModule; module += gridDim.x) {
      auto firstPixel = moduleStart[1 + module];
      auto thisModuleId = id[firstPixel];
      assert(thisModuleId < TrackerTraits::numberOfModules);

#ifdef GPU_DEBUG
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
#endif

      auto first = firstPixel + threadIdx.x;

      // find the index of the first pixel not belonging to this module (or invalid)
      msize = numElements;
      __syncthreads();

      for (int i = first; i < numElements; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (id[i] != thisModuleId) {  // find the first pixel in a different module
          atomicMin(&msize, i);
          break;
        }
      }

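      // The histogram bins the module's pixels by their y coordinate, so the nearest-neighbour
      // search below only has to scan the pixels of the same and of the next y column instead
      // of the whole module.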
      constexpr uint32_t maxPixInModule = TrackerTraits::maxPixInModule;
      constexpr auto nbins = TrackerTraits::clusterBinning;
      constexpr auto nbits = TrackerTraits::clusterBits;

      using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, nbits, uint16_t>;
      __shared__ Hist hist;
      __shared__ typename Hist::Counter ws[32];
      for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
        hist.off[j] = 0;
      }
      __syncthreads();

      assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));

      // limit the number of pixels to maxPixInModule
      if (0 == threadIdx.x) {
        if (msize - firstPixel > maxPixInModule) {
          printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
          msize = maxPixInModule + firstPixel;
        }
#ifdef GPU_DEBUG
        printf("pixelInModule > %d\n", msize - firstPixel);
#endif
      }

      __syncthreads();
      assert(msize - firstPixel <= maxPixInModule);

#ifdef GPU_DEBUG
      __shared__ uint32_t totGood;
      totGood = 0;
      __syncthreads();
#endif

      // Phase-1 only: remove duplicate pixels
      if constexpr (not isPhase2) {
        if (msize > 1) {
          // reset the status words
          for (uint32_t i = threadIdx.x; i < pixelStatus::size; i += blockDim.x) {
            status[i] = 0;
          }
          __syncthreads();
          // mark each pixel as found, promoting repeated pixels to duplicates
          for (int i = first; i < msize - 1; i += blockDim.x) {
            if (id[i] == invalidModuleId)  // skip invalid pixels
              continue;
            pixelStatus::promote(status, x[i], y[i]);
          }
          __syncthreads();
          // invalidate the pixels flagged as duplicates
          for (int i = first; i < msize - 1; i += blockDim.x) {
            if (id[i] == invalidModuleId)  // skip invalid pixels
              continue;
            if (pixelStatus::isDuplicate(status, x[i], y[i])) {
              id[i] = invalidModuleId;
              rawIdArr[i] = 0;
            }
          }
          __syncthreads();
        }
      }

      // fill the histogram with the pixel y coordinates
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.count(y[i]);
#ifdef GPU_DEBUG
        atomicAdd(&totGood, 1);
#endif
      }
      __syncthreads();
      if (threadIdx.x < 32)
        ws[threadIdx.x] = 0;  // workspace used by the prefix scan in hist.finalize()
      __syncthreads();
      hist.finalize(ws);
      __syncthreads();
#ifdef GPU_DEBUG
      assert(hist.size() == totGood);
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("histo size %d\n", hist.size());
#endif
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.fill(y[i], i - firstPixel);
      }

#ifdef __CUDA_ARCH__
      // assume that the whole module can be covered with up to 16 blockDim.x-wide iterations
      constexpr int maxiter = 16;
      if (threadIdx.x == 0 && (hist.size() / blockDim.x) >= maxiter)
        printf("THIS IS NOT SUPPOSED TO HAPPEN too many hits in module %d: %d for block size %d\n",
               thisModuleId,
               hist.size(),
               blockDim.x);
#else
      auto maxiter = hist.size();
#endif

      constexpr int maxNeighbours = 10;
      assert((hist.size() / blockDim.x) <= maxiter);

      // nearest-neighbour lists: up to maxNeighbours neighbours for each pixel handled by this thread
      uint16_t nn[maxiter][maxNeighbours];
      uint8_t nnn[maxiter];  // number of neighbours found for each pixel
      for (uint32_t k = 0; k < maxiter; ++k)
        nnn[k] = 0;

      __syncthreads();  // make sure the histogram filling is complete

#ifdef GPU_DEBUG
      // look for anomalously high occupancy
      __shared__ uint32_t n40, n60;
      n40 = n60 = 0;
      __syncthreads();
      for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
        if (hist.size(j) > 60)
          atomicAdd(&n60, 1);
        if (hist.size(j) > 40)
          atomicAdd(&n40, 1);
      }
      __syncthreads();
      if (0 == threadIdx.x) {
        if (n60 > 0)
          printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
        else if (n40 > 0)
          printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
      }
      __syncthreads();
#endif

      // fill the nearest-neighbour lists
      for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
        assert(k < maxiter);
        auto p = hist.begin() + j;
        auto i = *p + firstPixel;
        assert(id[i] != invalidModuleId);
        assert(id[i] == thisModuleId);  // same module
        int be = Hist::bin(y[i] + 1);
        auto e = hist.end(be);
        ++p;
        assert(0 == nnn[k]);
        // scan the remaining pixels in this y column and in the next one
        for (; p < e; ++p) {
          auto m = (*p) + firstPixel;
          assert(m != i);
          assert(int(y[m]) - int(y[i]) >= 0);
          assert(int(y[m]) - int(y[i]) <= 1);
          if (std::abs(int(x[m]) - int(x[i])) > 1)
            continue;
          auto l = nnn[k]++;
          assert(l < maxNeighbours);
          nn[k][l] = *p;
        }
      }

      // for each pixel, look at its neighbours: when two valid pixels within +/- 1 in x and y are
      // found, set both cluster ids to the minimum of the two; iterate until no more changes are
      // applied, so that all the pixels in a cluster end up carrying the cluster's lowest pixel
      // index (clusterId[i] == i for that pixel).
      bool more = true;
      int nloops = 0;
      while (__syncthreads_or(more)) {
        if (1 == nloops % 2) {
          // odd iterations: flatten the cluster-id chains (path compression)
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            auto m = clusterId[i];
            while (m != clusterId[m])
              m = clusterId[m];
            clusterId[i] = m;
          }
        } else {
          more = false;
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            for (int kk = 0; kk < nnn[k]; ++kk) {
              auto l = nn[k][kk];
              auto m = l + firstPixel;
              assert(m != i);
              auto old = atomicMin_block(&clusterId[m], clusterId[i]);
              if (old != clusterId[i]) {
                // end the loop only if no changes were applied
                more = true;
              }
              atomicMin_block(&clusterId[i], old);
            }  // neighbour loop
          }    // pixel loop
        }
        ++nloops;
      }  // end while

#ifdef GPU_DEBUG
      {
        // check that all threads executed the same number of iterations
        __shared__ int n0;
        if (threadIdx.x == 0)
          n0 = nloops;
        __syncthreads();
        auto ok = n0 == nloops;
        assert(__syncthreads_and(ok));
        if (thisModuleId % 100 == 1)
          if (threadIdx.x == 0)
            printf("# loops %d\n", nloops);
      }
#endif

      __shared__ unsigned int foundClusters;
      foundClusters = 0;
      __syncthreads();

      // count the clusters, identified by the pixels with clusterId[i] == i,
      // and mark these pixels with a negative id
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] == i) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          clusterId[i] = -(old + 1);
        }
      }
      __syncthreads();

      // propagate the negative id to all the pixels in the cluster
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] >= 0) {
          // mark each pixel in the cluster with the same id as its first pixel
          clusterId[i] = clusterId[clusterId[i]];
        }
      }
      __syncthreads();

      // adjust the cluster ids to be positive values starting from 0
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId) {  // skip invalid pixels
          clusterId[i] = invalidClusterId;
          continue;
        }
        clusterId[i] = -clusterId[i] - 1;
      }
      __syncthreads();

      // store the results for this module
      if (threadIdx.x == 0) {
        nClustersInModule[thisModuleId] = foundClusters;
        moduleId[module] = thisModuleId;
#ifdef GPU_DEBUG
        if (foundClusters > gMaxHit) {
          gMaxHit = foundClusters;
          if (foundClusters > 8)
            printf("max hit %d in %d\n", foundClusters, thisModuleId);
        }
        if (thisModuleId % 100 == 1)
          printf("%d clusters in module %d\n", foundClusters, thisModuleId);
#endif
      }
    }  // module loop
  }

}  // namespace gpuClustering

#endif  // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h