#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

#include <cstdint>
#include <cstdio>

#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

namespace gpuClustering {

  // Phase-1 pixel modules
  constexpr uint32_t pixelSizeX = 160;
  constexpr uint32_t pixelSizeY = 416;

  namespace pixelStatus {
    // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones
    enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 };

    constexpr uint32_t bits = 2;
    constexpr uint32_t mask = (0x01 << bits) - 1;
    constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits;
    constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord;
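    // a quick size check, worked out here for reference only: 16 two-bit values
    // fit in each 32-bit word, so a full Phase-1 module needs
    // 160 * 416 / 16 = 4160 words, i.e. about 16 kB when instantiated as the
    // shared-memory status array in findClus() below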

    __device__ static constexpr inline uint32_t getIndex(uint16_t x, uint16_t y) {
      return (pixelSizeX * y + x) / valuesPerWord;
    }

    __device__ constexpr inline uint32_t getShift(uint16_t x, uint16_t y) { return (x % valuesPerWord) * bits; }

    __device__ constexpr inline Status getStatus(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) {
      uint32_t index = getIndex(x, y);
      uint32_t shift = getShift(x, y);
      return Status{(status[index] >> shift) & mask};
    }

    __device__ constexpr inline bool isDuplicate(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) {
      return getStatus(status, x, y) == kDuplicate;
    }

    __device__ constexpr inline void promote(uint32_t* __restrict__ status, const uint16_t x, const uint16_t y) {
      uint32_t index = getIndex(x, y);
      uint32_t shift = getShift(x, y);
      uint32_t old_word = status[index];
      uint32_t expected = old_word;
      do {
        expected = old_word;
        Status old_status{(old_word >> shift) & mask};
        if (kDuplicate == old_status) {
          // nothing to do
          return;
        }
        Status new_status = (kEmpty == old_status) ? kFound : kDuplicate;
        uint32_t new_word = old_word | (static_cast<uint32_t>(new_status) << shift);
        old_word = atomicCAS(&status[index], expected, new_word);
      } while (expected != old_word);
    }

  }  // namespace pixelStatus
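
  // Worked example of the packed-status logic above (a sketch for illustration;
  // the buffer is hypothetical and the calls are not part of the kernels below):
  // with bits = 2 and valuesPerWord = 16, pixel (x=21, y=3) sits at linear
  // position 160 * 3 + 21 = 501, i.e. in word 501 / 16 = 31 at bit offset
  // (21 % 16) * 2 = 10.  Promoting the same pixel twice marks it as a duplicate:
  //
  //   uint32_t status[pixelStatus::size] = {};  // hypothetical, zero-initialized
  //   pixelStatus::promote(status, 21, 3);      // kEmpty -> kFound
  //   pixelStatus::promote(status, 21, 3);      // kFound -> kDuplicate
  //   assert(pixelStatus::isDuplicate(status, 21, 3));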

#ifdef GPU_DEBUG
  __device__ uint32_t gMaxHit = 0;
#endif

  template <bool isPhase2>
  __global__ void countModules(uint16_t const* __restrict__ id,
                               uint32_t* __restrict__ moduleStart,
                               int32_t* __restrict__ clusterId,
                               int numElements) {
    int first = blockDim.x * blockIdx.x + threadIdx.x;
    [[maybe_unused]] constexpr int nMaxModules =
        isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
    assert(nMaxModules < maxNumModules);
    for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
      clusterId[i] = i;
      if (invalidModuleId == id[i])
        continue;
      auto j = i - 1;
      while (j >= 0 and id[j] == invalidModuleId)
        --j;
      if (j < 0 or id[j] != id[i]) {
        // boundary...
        auto loc = atomicInc(moduleStart, nMaxModules);
        moduleStart[loc + 1] = i;
      }
    }
  }
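
  // A minimal launch sketch for countModules (illustrative only: the stream and
  // the *_d device buffers are hypothetical; the production launch lives in the
  // clusterizer code that includes this header):
  //
  //   int threadsPerBlock = 256;
  //   int blocks = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  //   // moduleStart[0] must be zeroed beforehand: it doubles as the atomic
  //   // counter of module boundaries found so far
  //   countModules<false><<<blocks, threadsPerBlock, 0, stream>>>(id_d, moduleStart_d, clusterId_d, numElements);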

  template <bool isPhase2>
  __global__ void findClus(uint32_t* __restrict__ rawIdArr,
                           uint16_t* __restrict__ id,                 // module id of each pixel
                           uint16_t const* __restrict__ x,            // local coordinates of each pixel
                           uint16_t const* __restrict__ y,            //
                           uint32_t const* __restrict__ moduleStart,  // index of the first pixel of each module
                           uint32_t* __restrict__ nClustersInModule,  // output: number of clusters found in each module
                           uint32_t* __restrict__ moduleId,           // output: module id of each module
                           int32_t* __restrict__ clusterId,           // output: cluster id of each pixel
                           int numElements) {
    // status is used only for Phase-1, but a __shared__ array cannot be declared conditionally,
    // i.e. only when isPhase2 is false; to minimize the impact on the Phase-2 reconstruction
    // it is declared with a very small size instead.
    constexpr const uint32_t pixelStatusSize = isPhase2 ? 1 : pixelStatus::size;
    __shared__ uint32_t status[pixelStatusSize];  // packed words array used to store the PixelStatus of each pixel
    __shared__ int msize;

    auto firstModule = blockIdx.x;
    auto endModule = moduleStart[0];

    [[maybe_unused]] constexpr int nMaxModules =
        isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
    assert(nMaxModules < maxNumModules);

    for (auto module = firstModule; module < endModule; module += gridDim.x) {
      auto firstPixel = moduleStart[1 + module];
      auto thisModuleId = id[firstPixel];
      assert(thisModuleId < nMaxModules);

#ifdef GPU_DEBUG
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
#endif

      auto first = firstPixel + threadIdx.x;

      // find the index of the first pixel not belonging to this module (or invalid)
      msize = numElements;
      __syncthreads();

      // each thread scans ahead for the first pixel that belongs to a different module
      for (int i = first; i < numElements; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (id[i] != thisModuleId) {  // find the first pixel in a different module
          atomicMin(&msize, i);
          break;
        }
      }

      // init hist (ymax = 416 < 512 : 9 bits)
      // 6000 max pixels required for HI operations with no measurable impact on pp performance
      constexpr uint32_t maxPixInModule = 6000;
      constexpr auto nbins = isPhase2 ? 1024 : phase1PixelTopology::numColsInModule + 2;  //2+2;
      constexpr auto nbits = isPhase2 ? 10 : 9;                                           //2+2;
      using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, nbits, uint16_t>;
      __shared__ Hist hist;
      __shared__ typename Hist::Counter ws[32];
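      // the histogram over pixel y-columns is built in two passes further down:
      // count() accumulates per-bin occupancies, finalize(ws) turns them into
      // cumulative offsets with a block-wide prefix scan (ws is its scratch
      // space), and fill() then stores each pixel index at its final position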
      for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
        hist.off[j] = 0;
      }
      __syncthreads();

      assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));

      // limit to maxPixInModule (FIXME: if this turns out to happen frequently, and not only in simulations with low thresholds, something cleverer will be needed)
      if (0 == threadIdx.x) {
        if (msize - firstPixel > maxPixInModule) {
          printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
          msize = maxPixInModule + firstPixel;
        }
      }

      __syncthreads();
      assert(msize - firstPixel <= maxPixInModule);

#ifdef GPU_DEBUG
      __shared__ uint32_t totGood;
      totGood = 0;
      __syncthreads();
#endif

      // remove duplicate pixels
      if constexpr (not isPhase2) {
        if (msize > 1) {
          for (uint32_t i = threadIdx.x; i < pixelStatus::size; i += blockDim.x) {
            status[i] = 0;
          }
          __syncthreads();
          for (int i = first; i < msize - 1; i += blockDim.x) {
            // skip invalid pixels
            if (id[i] == invalidModuleId)
              continue;
            pixelStatus::promote(status, x[i], y[i]);
          }
          __syncthreads();
          for (int i = first; i < msize - 1; i += blockDim.x) {
            // skip invalid pixels
            if (id[i] == invalidModuleId)
              continue;
            if (pixelStatus::isDuplicate(status, x[i], y[i])) {
              // printf("found dup %d %d %d %d\n", i, id[i], x[i], y[i]);
              id[i] = invalidModuleId;
              rawIdArr[i] = 0;
            }
          }
          __syncthreads();
        }
      }

      // fill histo
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.count(y[i]);
#ifdef GPU_DEBUG
        atomicAdd(&totGood, 1);
#endif
      }
      __syncthreads();
      if (threadIdx.x < 32)
        ws[threadIdx.x] = 0;  // used by prefix scan...
      __syncthreads();
      hist.finalize(ws);
      __syncthreads();
#ifdef GPU_DEBUG
      assert(hist.size() == totGood);
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("histo size %d\n", hist.size());
#endif
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.fill(y[i], i - firstPixel);
      }

#ifdef __CUDA_ARCH__
      // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
      constexpr int maxiter = 16;
      if (threadIdx.x == 0 && (hist.size() / blockDim.x) >= maxiter)
        printf("THIS IS NOT SUPPOSED TO HAPPEN too many hits in module %d: %d for block size %d\n",
               thisModuleId,
               hist.size(),
               blockDim.x);
#else
      auto maxiter = hist.size();
#endif
      // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
      constexpr int maxNeighbours = 10;
      assert((hist.size() / blockDim.x) <= maxiter);
      // nearest neighbour
      uint16_t nn[maxiter][maxNeighbours];
      uint8_t nnn[maxiter];  // number of nn
      for (uint32_t k = 0; k < maxiter; ++k)
        nnn[k] = 0;

      __syncthreads();  // for hit filling!

#ifdef GPU_DEBUG
      // look for anomalous high occupancy
      __shared__ uint32_t n40, n60;
      n40 = n60 = 0;
      __syncthreads();
      for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
        if (hist.size(j) > 60)
          atomicAdd(&n60, 1);
        if (hist.size(j) > 40)
          atomicAdd(&n40, 1);
      }
      __syncthreads();
      if (0 == threadIdx.x) {
        if (n60 > 0)
          printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
        else if (n40 > 0)
          printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
      }
      __syncthreads();
#endif

      // fill NN
      for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
        assert(k < maxiter);
        auto p = hist.begin() + j;
        auto i = *p + firstPixel;
        assert(id[i] != invalidModuleId);
        assert(id[i] == thisModuleId);  // same module
        int be = Hist::bin(y[i] + 1);
        auto e = hist.end(be);
        ++p;
        assert(0 == nnn[k]);
        for (; p < e; ++p) {
          auto m = (*p) + firstPixel;
          assert(m != i);
          assert(int(y[m]) - int(y[i]) >= 0);
          assert(int(y[m]) - int(y[i]) <= 1);
          if (std::abs(int(x[m]) - int(x[i])) > 1)
            continue;
          auto l = nnn[k]++;
          assert(l < maxNeighbours);
          nn[k][l] = *p;
        }
      }

      // for each pixel, look at all the pixels until the end of the module;
      // when two valid pixels within +/- 1 in both x and y are found, set their cluster id to the minimum;
      // after the loop, all the pixels in each cluster should have an id equal to that of the lowest
      // pixel in the cluster ( clusterId[i] == i ).
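      // a toy example of the iteration below (assuming three mutually adjacent
      // pixels whose cluster ids start as {5, 6, 7}): one pass of atomicMin
      // updates may leave {5, 5, 6}, and the following passes reach the fixed
      // point {5, 5, 5}; only when a full pass applies no change does every
      // thread leave 'more' false and __syncthreads_or(more) end the loop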
      bool more = true;
      int nloops = 0;
      while (__syncthreads_or(more)) {
        if (1 == nloops % 2) {
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            auto m = clusterId[i];
            while (m != clusterId[m])
              m = clusterId[m];
            clusterId[i] = m;
          }
        } else {
          more = false;
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            for (int kk = 0; kk < nnn[k]; ++kk) {
              auto l = nn[k][kk];
              auto m = l + firstPixel;
              assert(m != i);
              auto old = atomicMin_block(&clusterId[m], clusterId[i]);
              // do we need memory fence?
              if (old != clusterId[i]) {
                // a cluster id was changed, so another pass is needed
                more = true;
              }
              atomicMin_block(&clusterId[i], old);
            }  // nnloop
          }    // pixel loop
        }
        ++nloops;
      }  // end while

#ifdef GPU_DEBUG
      {
        __shared__ int n0;
        if (threadIdx.x == 0)
          n0 = nloops;
        __syncthreads();
        auto ok = n0 == nloops;
        assert(__syncthreads_and(ok));
        if (thisModuleId % 100 == 1)
          if (threadIdx.x == 0)
            printf("# loops %d\n", nloops);
      }
#endif

      __shared__ unsigned int foundClusters;
      foundClusters = 0;
      __syncthreads();

      // find the number of different clusters, identified by the pixels with clusterId[i] == i;
      // mark these pixels with a negative id.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] == i) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          clusterId[i] = -(old + 1);
        }
      }
      __syncthreads();

      // propagate the negative id to all the pixels in the cluster.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] >= 0) {
          // mark each pixel in a cluster with the same id as the first one
          clusterId[i] = clusterId[clusterId[i]];
        }
      }
      __syncthreads();

      // adjust the cluster id to be a positive value starting from 0
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId) {  // skip invalid pixels
          clusterId[i] = invalidClusterId;
          continue;
        }
        clusterId[i] = -clusterId[i] - 1;
      }
      __syncthreads();

      if (threadIdx.x == 0) {
        nClustersInModule[thisModuleId] = foundClusters;
        moduleId[module] = thisModuleId;
#ifdef GPU_DEBUG
        if (foundClusters > gMaxHit) {
          gMaxHit = foundClusters;
          if (foundClusters > 8)
            printf("max hit %d in %d\n", foundClusters, thisModuleId);
        }
#endif
#ifdef GPU_DEBUG
        if (thisModuleId % 100 == 1)
          printf("%d clusters in module %d\n", foundClusters, thisModuleId);
#endif
      }
    }  // module loop
  }
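
  // A launch sketch for findClus (illustrative only: the stream and the *_d
  // device buffers are hypothetical).  One block works on one module at a time,
  // since all per-module state (status, hist, ws) lives in shared memory, and
  // the grid-stride loop over modules tolerates fewer blocks than modules:
  //
  //   int threadsPerBlock = 128;
  //   int blocks = phase1PixelTopology::numberOfModules;
  //   findClus<false><<<blocks, threadsPerBlock, 0, stream>>>(
  //       rawIdArr_d, id_d, x_d, y_d, moduleStart_d, nClustersInModule_d, moduleId_d, clusterId_d, numElements);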
}  // namespace gpuClustering

#endif  // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h