#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

#include <cstdint>
#include <cstdio>

#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

namespace gpuClustering {

  // Phase-1 pixel modules
  constexpr uint32_t pixelSizeX = 160;
  constexpr uint32_t pixelSizeY = 416;

  namespace pixelStatus {
    // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones
    enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 };

    constexpr uint32_t bits = 2;
    constexpr uint32_t mask = (0x01 << bits) - 1;
    constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits;
    constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord;
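    // a quick size check, worked out here for reference only: 16 two-bit values
    // fit in each 32-bit word, so a full Phase-1 module needs
    // 160 * 416 / 16 = 4160 words, i.e. about 16 kB when instantiated as the
    // shared-memory status array in findClus() below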

    __device__ static constexpr inline uint32_t getIndex(uint16_t x, uint16_t y) {
      return (pixelSizeX * y + x) / valuesPerWord;
    }

    __device__ constexpr inline uint32_t getShift(uint16_t x, uint16_t y) { return (x % valuesPerWord) * bits; }

    __device__ constexpr inline Status getStatus(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) {
      uint32_t index = getIndex(x, y);
      uint32_t shift = getShift(x, y);
      return Status{(status[index] >> shift) & mask};
    }

    __device__ constexpr inline bool isDuplicate(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) {
      return getStatus(status, x, y) == kDuplicate;
    }

    __device__ constexpr inline void promote(uint32_t* __restrict__ status, const uint16_t x, const uint16_t y) {
      uint32_t index = getIndex(x, y);
      uint32_t shift = getShift(x, y);
      uint32_t old_word = status[index];
      uint32_t expected = old_word;
      do {
        expected = old_word;
        Status old_status{(old_word >> shift) & mask};
        if (kDuplicate == old_status) {
          // nothing to do
          return;
        }
        Status new_status = (kEmpty == old_status) ? kFound : kDuplicate;
        uint32_t new_word = old_word | (static_cast<uint32_t>(new_status) << shift);
        old_word = atomicCAS(&status[index], expected, new_word);
      } while (expected != old_word);
    }

  }  // namespace pixelStatus
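
  // Worked example of the packed-status logic above (a sketch for illustration;
  // the buffer is hypothetical and the calls are not part of the kernels below):
  // with bits = 2 and valuesPerWord = 16, pixel (x=21, y=3) sits at linear
  // position 160 * 3 + 21 = 501, i.e. in word 501 / 16 = 31 at bit offset
  // (21 % 16) * 2 = 10.  Promoting the same pixel twice marks it as a duplicate:
  //
  //   uint32_t status[pixelStatus::size] = {};  // hypothetical, zero-initialized
  //   pixelStatus::promote(status, 21, 3);      // kEmpty -> kFound
  //   pixelStatus::promote(status, 21, 3);      // kFound -> kDuplicate
  //   assert(pixelStatus::isDuplicate(status, 21, 3));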

#ifdef GPU_DEBUG
  __device__ uint32_t gMaxHit = 0;
#endif

  template <bool isPhase2>
  __global__ void countModules(uint16_t const* __restrict__ id,
                               uint32_t* __restrict__ moduleStart,
                               int32_t* __restrict__ clusterId,
                               int numElements) {
    int first = blockDim.x * blockIdx.x + threadIdx.x;
    [[maybe_unused]] constexpr int nMaxModules =
        isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
    assert(nMaxModules < maxNumModules);
    for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
      clusterId[i] = i;
      if (invalidModuleId == id[i])
        continue;
      auto j = i - 1;
      while (j >= 0 and id[j] == invalidModuleId)
        --j;
      if (j < 0 or id[j] != id[i]) {
        // boundary...
        auto loc = atomicInc(moduleStart, nMaxModules);
        moduleStart[loc + 1] = i;
      }
    }
  }
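
  // A minimal launch sketch for countModules (illustrative only: the stream and
  // the *_d device buffers are hypothetical; the production launch lives in the
  // clusterizer code that includes this header):
  //
  //   int threadsPerBlock = 256;
  //   int blocks = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  //   // moduleStart[0] must be zeroed beforehand: it doubles as the atomic
  //   // counter of module boundaries found so far
  //   countModules<false><<<blocks, threadsPerBlock, 0, stream>>>(id_d, moduleStart_d, clusterId_d, numElements);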

  template <bool isPhase2>
  __global__ void findClus(uint32_t* __restrict__ rawIdArr,
                           uint16_t* __restrict__ id,                 // module id of each pixel
                           uint16_t const* __restrict__ x,            // local coordinates of each pixel
                           uint16_t const* __restrict__ y,            //
                           uint32_t const* __restrict__ moduleStart,  // index of the first pixel of each module
                           uint32_t* __restrict__ nClustersInModule,  // output: number of clusters found in each module
                           uint32_t* __restrict__ moduleId,           // output: module id of each module
                           int32_t* __restrict__ clusterId,           // output: cluster id of each pixel
                           int numElements) {
    // status is used only for Phase-1, but a __shared__ array cannot be declared conditionally,
    // i.e. only when isPhase2 is false; to minimize the impact on the Phase-2 reconstruction
    // it is declared with a very small size instead.
    constexpr const uint32_t pixelStatusSize = isPhase2 ? 1 : pixelStatus::size;
    __shared__ uint32_t status[pixelStatusSize];  // packed words array used to store the PixelStatus of each pixel
    __shared__ int msize;

    auto firstModule = blockIdx.x;
    auto endModule = moduleStart[0];

    [[maybe_unused]] constexpr int nMaxModules =
        isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
    assert(nMaxModules < maxNumModules);

    for (auto module = firstModule; module < endModule; module += gridDim.x) {
      auto firstPixel = moduleStart[1 + module];
      auto thisModuleId = id[firstPixel];
      assert(thisModuleId < nMaxModules);

#ifdef GPU_DEBUG
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
#endif

      auto first = firstPixel + threadIdx.x;

      // find the index of the first pixel not belonging to this module (or invalid)
      msize = numElements;
      __syncthreads();

      // each thread scans ahead for the first pixel that belongs to a different module
      for (int i = first; i < numElements; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (id[i] != thisModuleId) {  // find the first pixel in a different module
          atomicMin(&msize, i);
          break;
        }
      }

      // init hist (ymax = 416 < 512 : 9 bits)
      // 6000 max pixels required for HI operations with no measurable impact on pp performance
      constexpr uint32_t maxPixInModule = 6000;
      constexpr auto nbins = isPhase2 ? 1024 : phase1PixelTopology::numColsInModule + 2;  //2+2;
      constexpr auto nbits = isPhase2 ? 10 : 9;                                           //2+2;
      using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, nbits, uint16_t>;
      __shared__ Hist hist;
      __shared__ typename Hist::Counter ws[32];
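      // the histogram over pixel y-columns is built in two passes further down:
      // count() accumulates per-bin occupancies, finalize(ws) turns them into
      // cumulative offsets with a block-wide prefix scan (ws is its scratch
      // space), and fill() then stores each pixel index at its final position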
      for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
        hist.off[j] = 0;
      }
      __syncthreads();

      assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));

      // limit to maxPixInModule (FIXME: if this turns out to happen frequently, and not only in simulations with low thresholds, something cleverer will be needed)
      if (0 == threadIdx.x) {
        if (msize - firstPixel > maxPixInModule) {
          printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
          msize = maxPixInModule + firstPixel;
        }
      }

      __syncthreads();
      assert(msize - firstPixel <= maxPixInModule);

#ifdef GPU_DEBUG
      __shared__ uint32_t totGood;
      totGood = 0;
      __syncthreads();
#endif

      // remove duplicate pixels
      if constexpr (not isPhase2) {
        if (msize > 1) {
          for (uint32_t i = threadIdx.x; i < pixelStatus::size; i += blockDim.x) {
            status[i] = 0;
          }
          __syncthreads();
          for (int i = first; i < msize - 1; i += blockDim.x) {
            // skip invalid pixels
            if (id[i] == invalidModuleId)
              continue;
            pixelStatus::promote(status, x[i], y[i]);
          }
          __syncthreads();
          for (int i = first; i < msize - 1; i += blockDim.x) {
            // skip invalid pixels
            if (id[i] == invalidModuleId)
              continue;
            if (pixelStatus::isDuplicate(status, x[i], y[i])) {
              // printf("found dup %d %d %d %d\n", i, id[i], x[i], y[i]);
              id[i] = invalidModuleId;
              rawIdArr[i] = 0;
            }
          }
          __syncthreads();
        }
      }

      // fill histo
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.count(y[i]);
#ifdef GPU_DEBUG
        atomicAdd(&totGood, 1);
#endif
      }
      __syncthreads();
      if (threadIdx.x < 32)
        ws[threadIdx.x] = 0;  // used by prefix scan...
      __syncthreads();
      hist.finalize(ws);
      __syncthreads();
#ifdef GPU_DEBUG
      assert(hist.size() == totGood);
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("histo size %d\n", hist.size());
#endif
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.fill(y[i], i - firstPixel);
      }

#ifdef __CUDA_ARCH__
      // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
      constexpr int maxiter = 16;
      if (threadIdx.x == 0 && (hist.size() / blockDim.x) >= maxiter)
        printf("THIS IS NOT SUPPOSED TO HAPPEN too many hits in module %d: %d for block size %d\n",
               thisModuleId,
               hist.size(),
               blockDim.x);
#else
      auto maxiter = hist.size();
#endif
      // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
      constexpr int maxNeighbours = 10;
      assert((hist.size() / blockDim.x) <= maxiter);
      // nearest neighbour
      uint16_t nn[maxiter][maxNeighbours];
      uint8_t nnn[maxiter];  // number of nn
      for (uint32_t k = 0; k < maxiter; ++k)
        nnn[k] = 0;

      __syncthreads();  // for hit filling!

#ifdef GPU_DEBUG
      // look for anomalous high occupancy
      __shared__ uint32_t n40, n60;
      n40 = n60 = 0;
      __syncthreads();
      for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
        if (hist.size(j) > 60)
          atomicAdd(&n60, 1);
        if (hist.size(j) > 40)
          atomicAdd(&n40, 1);
      }
      __syncthreads();
      if (0 == threadIdx.x) {
        if (n60 > 0)
          printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
        else if (n40 > 0)
          printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
      }
      __syncthreads();
#endif

      // fill NN
      for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
        assert(k < maxiter);
        auto p = hist.begin() + j;
        auto i = *p + firstPixel;
        assert(id[i] != invalidModuleId);
        assert(id[i] == thisModuleId);  // same module
        int be = Hist::bin(y[i] + 1);
        auto e = hist.end(be);
        ++p;
        assert(0 == nnn[k]);
        for (; p < e; ++p) {
          auto m = (*p) + firstPixel;
          assert(m != i);
          assert(int(y[m]) - int(y[i]) >= 0);
          assert(int(y[m]) - int(y[i]) <= 1);
          if (std::abs(int(x[m]) - int(x[i])) > 1)
            continue;
          auto l = nnn[k]++;
          assert(l < maxNeighbours);
          nn[k][l] = *p;
        }
      }

      // for each pixel, look at all the pixels until the end of the module;
      // when two valid pixels within +/- 1 in both x and y are found, set their cluster id to the minimum;
      // after the loop, all the pixels in each cluster should have an id equal to that of the lowest
      // pixel in the cluster ( clusterId[i] == i ).
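      // a toy example of the iteration below (assuming three mutually adjacent
      // pixels whose cluster ids start as {5, 6, 7}): one pass of atomicMin
      // updates may leave {5, 5, 6}, and the following passes reach the fixed
      // point {5, 5, 5}; only when a full pass applies no change does every
      // thread leave 'more' false and __syncthreads_or(more) end the loop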
      bool more = true;
      int nloops = 0;
      while (__syncthreads_or(more)) {
        if (1 == nloops % 2) {
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            auto m = clusterId[i];
            while (m != clusterId[m])
              m = clusterId[m];
            clusterId[i] = m;
          }
        } else {
          more = false;
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            for (int kk = 0; kk < nnn[k]; ++kk) {
              auto l = nn[k][kk];
              auto m = l + firstPixel;
              assert(m != i);
              auto old = atomicMin_block(&clusterId[m], clusterId[i]);
              // do we need memory fence?
              if (old != clusterId[i]) {
                // a cluster id was changed, so another pass is needed
                more = true;
              }
              atomicMin_block(&clusterId[i], old);
            }  // nnloop
          }    // pixel loop
        }
        ++nloops;
      }  // end while

#ifdef GPU_DEBUG
      {
        __shared__ int n0;
        if (threadIdx.x == 0)
          n0 = nloops;
        __syncthreads();
        auto ok = n0 == nloops;
        assert(__syncthreads_and(ok));
        if (thisModuleId % 100 == 1)
          if (threadIdx.x == 0)
            printf("# loops %d\n", nloops);
      }
#endif

      __shared__ unsigned int foundClusters;
      foundClusters = 0;
      __syncthreads();

      // find the number of different clusters, identified by the pixels with clusterId[i] == i;
      // mark these pixels with a negative id.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] == i) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          clusterId[i] = -(old + 1);
        }
      }
      __syncthreads();

      // propagate the negative id to all the pixels in the cluster.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] >= 0) {
          // mark each pixel in a cluster with the same id as the first one
          clusterId[i] = clusterId[clusterId[i]];
        }
      }
      __syncthreads();

      // adjust the cluster id to be a positive value starting from 0
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId) {  // skip invalid pixels
          clusterId[i] = invalidClusterId;
          continue;
        }
        clusterId[i] = -clusterId[i] - 1;
      }
      __syncthreads();

      if (threadIdx.x == 0) {
        nClustersInModule[thisModuleId] = foundClusters;
        moduleId[module] = thisModuleId;
#ifdef GPU_DEBUG
        if (foundClusters > gMaxHit) {
          gMaxHit = foundClusters;
          if (foundClusters > 8)
            printf("max hit %d in %d\n", foundClusters, thisModuleId);
        }
#endif
#ifdef GPU_DEBUG
        if (thisModuleId % 100 == 1)
          printf("%d clusters in module %d\n", foundClusters, thisModuleId);
#endif
      }
    }  // module loop
  }
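
  // A launch sketch for findClus (illustrative only: the stream and the *_d
  // device buffers are hypothetical).  One block works on one module at a time,
  // since all per-module state (status, hist, ws) lives in shared memory, and
  // the grid-stride loop over modules tolerates fewer blocks than modules:
  //
  //   int threadsPerBlock = 128;
  //   int blocks = phase1PixelTopology::numberOfModules;
  //   findClus<false><<<blocks, threadsPerBlock, 0, stream>>>(
  //       rawIdArr_d, id_d, x_d, y_d, moduleStart_d, nClustersInModule_d, moduleId_d, clusterId_d, numElements);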
}  // namespace gpuClustering

#endif  // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h