Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2025-02-14 03:16:54

0001 #ifndef RecoLocalTracker_SiPixelClusterizer_alpaka_PixelClustering_h
0002 #define RecoLocalTracker_SiPixelClusterizer_alpaka_PixelClustering_h
0003 
0004 #include <cmath>
0005 #include <cstdint>
0006 #include <cstdio>
0007 #include <type_traits>
0008 
0009 #include <alpaka/alpaka.hpp>
0010 
0011 #include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h"
0012 #include "FWCore/Utilities/interface/DeviceGlobal.h"
0013 #include "FWCore/Utilities/interface/HostDeviceConstant.h"
0014 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
0015 #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
0016 #include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h"
0017 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0018 #include "HeterogeneousCore/AlpakaInterface/interface/warpsize.h"
0019 
0020 //#define GPU_DEBUG
0021 
0022 namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
0023 
0024 #ifdef GPU_DEBUG
0025   DEVICE_GLOBAL uint32_t gMaxHit = 0;
0026 #endif
0027 
0028   namespace pixelStatus {
0029     // Phase-1 pixel modules
0030     constexpr uint32_t pixelSizeX = pixelTopology::Phase1::numRowsInModule;
0031     constexpr uint32_t pixelSizeY = pixelTopology::Phase1::numColsInModule;
0032 
0033     // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones
0034     enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 };
0035 
0036     constexpr uint32_t bits = 2;
0037     constexpr uint32_t mask = (0x01 << bits) - 1;
0038     constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits;
0039     constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord;
0040 
0041     ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getIndex(uint16_t x, uint16_t y) {
0042       return (pixelSizeX * y + x) / valuesPerWord;
0043     }
0044 
0045     ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getShift(uint16_t x, uint16_t y) {
0046       return (x % valuesPerWord) * 2;
0047     }
0048 
0049     ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Status getStatus(uint32_t const* __restrict__ status,
0050                                                               uint16_t x,
0051                                                               uint16_t y) {
0052       uint32_t index = getIndex(x, y);
0053       uint32_t shift = getShift(x, y);
0054       return Status{(status[index] >> shift) & mask};
0055     }
0056 
0057     ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr bool isDuplicate(uint32_t const* __restrict__ status,
0058                                                               uint16_t x,
0059                                                               uint16_t y) {
0060       return getStatus(status, x, y) == kDuplicate;
0061     }
0062 
0063     /* FIXME
0064        * In the more general case (e.g. a multithreaded CPU backend) there is a potential race condition
0065        * between the read of status[index] at line NNN and the atomicCas at line NNN.
0066        * We should investigate:
0067        *   - if `status` should be read through a `volatile` pointer (CUDA/ROCm)
0068        *   - if `status` should be read with an atomic load (CPU)
0069        */
0070     ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr void promote(Acc1D const& acc,
0071                                                           uint32_t* __restrict__ status,
0072                                                           const uint16_t x,
0073                                                           const uint16_t y) {
0074       uint32_t index = getIndex(x, y);
0075       uint32_t shift = getShift(x, y);
0076       uint32_t old_word = status[index];
0077       uint32_t expected = old_word;
0078       do {
0079         expected = old_word;
0080         Status old_status{(old_word >> shift) & mask};
0081         if (kDuplicate == old_status) {
0082           // nothing to do
0083           return;
0084         }
0085         Status new_status = (kEmpty == old_status) ? kFound : kDuplicate;
0086         uint32_t new_word = old_word | (static_cast<uint32_t>(new_status) << shift);
0087         old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Blocks{});
0088       } while (expected != old_word);
0089     }
0090 
0091   }  // namespace pixelStatus
0092 
0093   template <typename TrackerTraits>
0094   struct CountModules {
0095     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0096                                   SiPixelDigisSoAView digi_view,
0097                                   SiPixelClustersSoAView clus_view,
0098                                   const unsigned int numElements) const {
0099       // Make sure the atomicInc below does not overflow
0100       static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules);
0101 
0102 #ifdef GPU_DEBUG
0103       if (cms::alpakatools::once_per_grid(acc)) {
0104         printf("Starting to count modules to set module starts:");
0105       }
0106 #endif
0107       for (int32_t i : cms::alpakatools::uniform_elements(acc, numElements)) {
0108         digi_view[i].clus() = i;
0109         if (::pixelClustering::invalidModuleId == digi_view[i].moduleId())
0110           continue;
0111 
0112         int32_t j = i - 1;
0113         while (j >= 0 and digi_view[j].moduleId() == ::pixelClustering::invalidModuleId)
0114           --j;
0115         if (j < 0 or digi_view[j].moduleId() != digi_view[i].moduleId()) {
0116           // Found a module boundary: count the number of modules in  clus_view[0].moduleStart()
0117           auto loc = alpaka::atomicInc(acc,
0118                                        &clus_view[0].moduleStart(),
0119                                        static_cast<uint32_t>(::pixelClustering::maxNumModules),
0120                                        alpaka::hierarchy::Blocks{});
0121           ALPAKA_ASSERT_ACC(loc < TrackerTraits::numberOfModules);
0122 #ifdef GPU_DEBUG
0123           printf("> New module (no. %d) found at digi %d \n", loc, i);
0124 #endif
0125           clus_view[loc + 1].moduleStart() = i;
0126         }
0127       }
0128     }
0129   };
0130 
0131   template <typename TrackerTraits>
0132   struct FindClus {
0133     // assume that we can cover the whole module with up to 16 blockDimension-wide iterations
0134     static constexpr uint32_t maxIterGPU = 16;
0135 
0136     // this must be larger than maxPixInModule / maxIterGPU, and should be a multiple of the warp size
0137     static constexpr uint32_t maxElementsPerBlock =
0138         cms::alpakatools::round_up_by(TrackerTraits::maxPixInModule / maxIterGPU, 128);
0139 
0140     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0141                                   SiPixelDigisSoAView digi_view,
0142                                   SiPixelClustersSoAView clus_view,
0143                                   const unsigned int numElements) const {
0144       static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules);
0145 
0146       auto& lastPixel = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
0147 
0148       const uint32_t lastModule = clus_view[0].moduleStart();
0149       for (uint32_t module : cms::alpakatools::independent_groups(acc, lastModule)) {
0150         auto firstPixel = clus_view[1 + module].moduleStart();
0151         uint32_t thisModuleId = digi_view[firstPixel].moduleId();
0152         ALPAKA_ASSERT_ACC(thisModuleId < TrackerTraits::numberOfModules);
0153 
0154 #ifdef GPU_DEBUG
0155         if (thisModuleId % 100 == 1)
0156           if (cms::alpakatools::once_per_block(acc))
0157             printf("start clusterizer for module %4d in block %4d\n",
0158                    thisModuleId,
0159                    alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
0160 #endif
0161 
0162         // find the index of the first pixel not belonging to this module (or invalid)
0163         lastPixel = numElements;
0164         alpaka::syncBlockThreads(acc);
0165 
0166         // skip threads not associated to an existing pixel
0167         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, numElements)) {
0168           auto id = digi_view[i].moduleId();
0169           // skip invalid pixels
0170           if (id == ::pixelClustering::invalidModuleId)
0171             continue;
0172           // find the first pixel in a different module
0173           if (id != thisModuleId) {
0174             alpaka::atomicMin(acc, &lastPixel, i, alpaka::hierarchy::Threads{});
0175             break;
0176           }
0177         }
0178 
0179         using Hist = cms::alpakatools::HistoContainer<uint16_t,
0180                                                       TrackerTraits::clusterBinning,
0181                                                       TrackerTraits::maxPixInModule,
0182                                                       TrackerTraits::clusterBits,
0183                                                       uint16_t>;
0184         constexpr int warpSize = cms::alpakatools::warpSize;
0185         auto& hist = alpaka::declareSharedVar<Hist, __COUNTER__>(acc);
0186         auto& ws = alpaka::declareSharedVar<typename Hist::Counter[warpSize], __COUNTER__>(acc);
0187         for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::totbins())) {
0188           hist.off[j] = 0;
0189         }
0190         alpaka::syncBlockThreads(acc);
0191 
0192         ALPAKA_ASSERT_ACC((lastPixel == numElements) or
0193                           ((lastPixel < numElements) and (digi_view[lastPixel].moduleId() != thisModuleId)));
0194         // limit to maxPixInModule  (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
0195         if (cms::alpakatools::once_per_block(acc)) {
0196           if (lastPixel - firstPixel > TrackerTraits::maxPixInModule) {
0197             printf("too many pixels in module %u: %u > %u\n",
0198                    thisModuleId,
0199                    lastPixel - firstPixel,
0200                    TrackerTraits::maxPixInModule);
0201             lastPixel = TrackerTraits::maxPixInModule + firstPixel;
0202           }
0203         }
0204         alpaka::syncBlockThreads(acc);
0205         ALPAKA_ASSERT_ACC(lastPixel - firstPixel <= TrackerTraits::maxPixInModule);
0206 
0207 #ifdef GPU_DEBUG
0208         auto& totGood = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
0209         totGood = 0;
0210         alpaka::syncBlockThreads(acc);
0211 #endif
0212 
0213         // remove duplicate pixels
0214         constexpr bool isPhase2 = std::is_base_of<pixelTopology::Phase2, TrackerTraits>::value;
0215         if constexpr (not isPhase2) {
0216           // packed words array used to store the pixelStatus of each pixel
0217           auto& status = alpaka::declareSharedVar<uint32_t[pixelStatus::size], __COUNTER__>(acc);
0218 
0219           if (lastPixel > 1) {
0220             for (uint32_t i : cms::alpakatools::independent_group_elements(acc, pixelStatus::size)) {
0221               status[i] = 0;
0222             }
0223             alpaka::syncBlockThreads(acc);
0224 
0225             for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) {
0226               // skip invalid pixels
0227               if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
0228                 continue;
0229               pixelStatus::promote(acc, status, digi_view[i].xx(), digi_view[i].yy());
0230             }
0231             alpaka::syncBlockThreads(acc);
0232 
0233             for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) {
0234               // skip invalid pixels
0235               if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
0236                 continue;
0237               if (pixelStatus::isDuplicate(status, digi_view[i].xx(), digi_view[i].yy())) {
0238                 digi_view[i].moduleId() = ::pixelClustering::invalidModuleId;
0239                 digi_view[i].rawIdArr() = 0;
0240               }
0241             }
0242             alpaka::syncBlockThreads(acc);
0243           }
0244         }
0245 
0246         // fill histo
0247         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0248           // skip invalid pixels
0249           if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) {
0250             hist.count(acc, digi_view[i].yy());
0251 #ifdef GPU_DEBUG
0252             alpaka::atomicAdd(acc, &totGood, 1u, alpaka::hierarchy::Blocks{});
0253 #endif
0254           }
0255         }
0256         alpaka::syncBlockThreads(acc);  // FIXME this can be removed
0257         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, warpSize)) {
0258           ws[i] = 0;  // used by prefix scan...
0259         }
0260         alpaka::syncBlockThreads(acc);
0261         hist.finalize(acc, ws);
0262         alpaka::syncBlockThreads(acc);
0263 #ifdef GPU_DEBUG
0264         ALPAKA_ASSERT_ACC(hist.size() == totGood);
0265         if (thisModuleId % 100 == 1)
0266           if (cms::alpakatools::once_per_block(acc))
0267             printf("histo size %d\n", hist.size());
0268 #endif
0269         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0270           // skip invalid pixels
0271           if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) {
0272             hist.fill(acc, digi_view[i].yy(), i - firstPixel);
0273           }
0274         }
0275 
0276 #ifdef GPU_DEBUG
0277         // look for anomalous high occupancy
0278         auto& n40 = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
0279         auto& n60 = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
0280         if (cms::alpakatools::once_per_block(acc)) {
0281           n40 = 0;
0282           n60 = 0;
0283         }
0284         alpaka::syncBlockThreads(acc);
0285         for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::nbins())) {
0286           if (hist.size(j) > 60)
0287             alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Blocks{});
0288           if (hist.size(j) > 40)
0289             alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Blocks{});
0290         }
0291         alpaka::syncBlockThreads(acc);
0292         if (cms::alpakatools::once_per_block(acc)) {
0293           if (n60 > 0)
0294             printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
0295           else if (n40 > 0)
0296             printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
0297         }
0298         alpaka::syncBlockThreads(acc);
0299 #endif
0300 
0301         [[maybe_unused]] const uint32_t blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0302         // assume that we can cover the whole module with up to maxIterGPU blockDimension-wide iterations
0303         ALPAKA_ASSERT_ACC((hist.size() / blockDimension) < maxIterGPU);
0304 
0305         // number of elements per thread
0306         constexpr uint32_t maxElements =
0307             cms::alpakatools::requires_single_thread_per_block_v<Acc1D> ? maxElementsPerBlock : 1;
0308         ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u] <= maxElements));
0309 
0310         constexpr unsigned int maxIter = maxIterGPU * maxElements;
0311 
0312         // nearest neighbours (nn)
0313         // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
0314         constexpr int maxNeighbours = 10;
0315         uint16_t nn[maxIter][maxNeighbours];
0316         uint8_t nnn[maxIter];  // number of nn
0317         for (uint32_t k = 0; k < maxIter; ++k) {
0318           nnn[k] = 0;
0319         }
0320 
0321         alpaka::syncBlockThreads(acc);  // for hit filling!
0322 
0323         // fill the nearest neighbours
0324         uint32_t k = 0;
0325         for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
0326           ALPAKA_ASSERT_ACC(k < maxIter);
0327           auto p = hist.begin() + j;
0328           auto i = *p + firstPixel;
0329           ALPAKA_ASSERT_ACC(digi_view[i].moduleId() != ::pixelClustering::invalidModuleId);
0330           ALPAKA_ASSERT_ACC(digi_view[i].moduleId() == thisModuleId);  // same module
0331           auto bin = Hist::bin(digi_view[i].yy() + 1);
0332           auto end = hist.end(bin);
0333           ++p;
0334           ALPAKA_ASSERT_ACC(0 == nnn[k]);
0335           for (; p < end; ++p) {
0336             auto m = *p + firstPixel;
0337             ALPAKA_ASSERT_ACC(m != i);
0338             ALPAKA_ASSERT_ACC(int(digi_view[m].yy()) - int(digi_view[i].yy()) >= 0);
0339             ALPAKA_ASSERT_ACC(int(digi_view[m].yy()) - int(digi_view[i].yy()) <= 1);
0340             if (std::abs(int(digi_view[m].xx()) - int(digi_view[i].xx())) <= 1) {
0341               auto l = nnn[k]++;
0342               ALPAKA_ASSERT_ACC(l < maxNeighbours);
0343               nn[k][l] = *p;
0344             }
0345           }
0346           ++k;
0347         }
0348 
0349         // for each pixel, look at all the pixels until the end of the module;
0350         // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
0351         // after the loop, all the pixel in each cluster should have the id equeal to the lowest
0352         // pixel in the cluster ( clus[i] == i ).
0353         bool more = true;
0354         /*
0355           int nloops = 0;
0356           */
0357         while (alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, more)) {
0358           /*
0359             if (nloops % 2 == 0) {
0360               // even iterations of the outer loop
0361             */
0362           more = false;
0363           uint32_t k = 0;
0364           for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
0365             ALPAKA_ASSERT_ACC(k < maxIter);
0366             auto p = hist.begin() + j;
0367             auto i = *p + firstPixel;
0368             for (int kk = 0; kk < nnn[k]; ++kk) {
0369               auto l = nn[k][kk];
0370               auto m = l + firstPixel;
0371               ALPAKA_ASSERT_ACC(m != i);
0372               // FIXME ::Threads ?
0373               auto old = alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Blocks{});
0374               if (old != digi_view[i].clus()) {
0375                 // end the loop only if no changes were applied
0376                 more = true;
0377               }
0378               // FIXME ::Threads ?
0379               alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Blocks{});
0380             }  // neighbours loop
0381             ++k;
0382           }  // pixel loop
0383           /*
0384               // use the outer loop to force a synchronisation
0385             } else {
0386               // odd iterations of the outer loop
0387             */
0388           alpaka::syncBlockThreads(acc);
0389           for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
0390             auto p = hist.begin() + j;
0391             auto i = *p + firstPixel;
0392             auto m = digi_view[i].clus();
0393             while (m != digi_view[m].clus())
0394               m = digi_view[m].clus();
0395             digi_view[i].clus() = m;
0396           }
0397           /*
0398             }
0399             ++nloops;
0400             */
0401         }  // end while
0402 
0403         /*
0404             // check that all threads in the block have executed the same number of iterations
0405             auto& n0 = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0406             if (cms::alpakatools::once_per_block(acc))
0407               n0 = nloops;
0408             alpaka::syncBlockThreads(acc);
0409             ALPAKA_ASSERT_ACC(alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, nloops == n0));
0410             if (thisModuleId % 100 == 1)
0411               if (cms::alpakatools::once_per_block(acc))
0412                 printf("# loops %d\n", nloops);
0413           */
0414 
0415         auto& foundClusters = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
0416         foundClusters = 0;
0417         alpaka::syncBlockThreads(acc);
0418 
0419         // find the number of different clusters, identified by a pixels with clus[i] == i;
0420         // mark these pixels with a negative id.
0421         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0422           // skip invalid pixels
0423           if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
0424             continue;
0425           if (digi_view[i].clus() == static_cast<int>(i)) {
0426             auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{});
0427             digi_view[i].clus() = -(old + 1);
0428           }
0429         }
0430         alpaka::syncBlockThreads(acc);
0431 
0432         // propagate the negative id to all the pixels in the cluster.
0433         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0434           // skip invalid pixels
0435           if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
0436             continue;
0437           if (digi_view[i].clus() >= 0) {
0438             // mark each pixel in a cluster with the same id as the first one
0439             digi_view[i].clus() = digi_view[digi_view[i].clus()].clus();
0440           }
0441         }
0442         alpaka::syncBlockThreads(acc);
0443 
0444         // adjust the cluster id to be a positive value starting from 0
0445         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0446           if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) {
0447             // mark invalid pixels with an invalid cluster index
0448             digi_view[i].clus() = ::pixelClustering::invalidClusterId;
0449           } else {
0450             digi_view[i].clus() = -digi_view[i].clus() - 1;
0451           }
0452         }
0453         alpaka::syncBlockThreads(acc);
0454 
0455         if (cms::alpakatools::once_per_block(acc)) {
0456           clus_view[thisModuleId].clusInModule() = foundClusters;
0457           clus_view[module].moduleId() = thisModuleId;
0458 #ifdef GPU_DEBUG
0459           if (foundClusters > gMaxHit) {
0460             gMaxHit = foundClusters;
0461             if (foundClusters > 8)
0462               printf("max hit %d in %d\n", foundClusters, thisModuleId);
0463           }
0464           if (thisModuleId % 100 == 1)
0465             printf("%d clusters in module %d\n", foundClusters, thisModuleId);
0466 #endif
0467         }
0468       }  // module loop
0469     }
0470   };
0471 
0472 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering
0473 
0474 #endif  // plugin_SiPixelClusterizer_alpaka_PixelClustering.h