Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:26:18

0001 #ifndef RecoLocalTracker_SiPixelClusterizer_alpaka_PixelClustering_h
0002 #define RecoLocalTracker_SiPixelClusterizer_alpaka_PixelClustering_h
0003 
0004 #include <cmath>
0005 #include <cstdint>
0006 #include <cstdio>
0007 #include <type_traits>
0008 
0009 #include <alpaka/alpaka.hpp>
0010 
0011 #include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h"
0012 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h"
0015 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0016 
0017 //#define GPU_DEBUG
0018 
0019 namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
0020 
0021 #ifdef GPU_DEBUG
0022   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0023   ALPAKA_STATIC_ACC_MEM_GLOBAL uint32_t gMaxHit = 0;
0024 #endif
0025 
0026   namespace pixelStatus {
0027     // Phase-1 pixel modules
0028     constexpr uint32_t pixelSizeX = pixelTopology::Phase1::numRowsInModule;
0029     constexpr uint32_t pixelSizeY = pixelTopology::Phase1::numColsInModule;
0030 
0031     // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones
0032     enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 };
0033 
0034     constexpr uint32_t bits = 2;
0035     constexpr uint32_t mask = (0x01 << bits) - 1;
0036     constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits;
0037     constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord;
0038 
0039     ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getIndex(uint16_t x, uint16_t y) {
0040       return (pixelSizeX * y + x) / valuesPerWord;
0041     }
0042 
0043     ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getShift(uint16_t x, uint16_t y) {
0044       return (x % valuesPerWord) * 2;
0045     }
0046 
0047     ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Status getStatus(uint32_t const* __restrict__ status,
0048                                                               uint16_t x,
0049                                                               uint16_t y) {
0050       uint32_t index = getIndex(x, y);
0051       uint32_t shift = getShift(x, y);
0052       return Status{(status[index] >> shift) & mask};
0053     }
0054 
0055     ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr bool isDuplicate(uint32_t const* __restrict__ status,
0056                                                               uint16_t x,
0057                                                               uint16_t y) {
0058       return getStatus(status, x, y) == kDuplicate;
0059     }
0060 
0061     /* FIXME
0062        * In the more general case (e.g. a multithreaded CPU backend) there is a potential race condition
0063        * between the read of status[index] at line NNN and the atomicCas at line NNN.
0064        * We should investigate:
0065        *   - if `status` should be read through a `volatile` pointer (CUDA/ROCm)
0066        *   - if `status` should be read with an atomic load (CPU)
0067        */
0068     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0069     ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr void promote(TAcc const& acc,
0070                                                           uint32_t* __restrict__ status,
0071                                                           const uint16_t x,
0072                                                           const uint16_t y) {
0073       uint32_t index = getIndex(x, y);
0074       uint32_t shift = getShift(x, y);
0075       uint32_t old_word = status[index];
0076       uint32_t expected = old_word;
0077       do {
0078         expected = old_word;
0079         Status old_status{(old_word >> shift) & mask};
0080         if (kDuplicate == old_status) {
0081           // nothing to do
0082           return;
0083         }
0084         Status new_status = (kEmpty == old_status) ? kFound : kDuplicate;
0085         uint32_t new_word = old_word | (static_cast<uint32_t>(new_status) << shift);
0086         old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Blocks{});
0087       } while (expected != old_word);
0088     }
0089 
0090   }  // namespace pixelStatus
0091 
0092   template <typename TrackerTraits>
0093   struct CountModules {
0094     template <typename TAcc>
0095     ALPAKA_FN_ACC void operator()(const TAcc& acc,
0096                                   SiPixelDigisSoAView digi_view,
0097                                   SiPixelClustersSoAView clus_view,
0098                                   const unsigned int numElements) const {
0099       // Make sure the atomicInc below does not overflow
0100       static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules);
0101 
0102 #ifdef GPU_DEBUG
0103       if (cms::alpakatools::once_per_grid(acc)) {
0104         printf("Starting to count modules to set module starts:");
0105       }
0106 #endif
0107       for (int32_t i : cms::alpakatools::uniform_elements(acc, numElements)) {
0108         digi_view[i].clus() = i;
0109         if (::pixelClustering::invalidModuleId == digi_view[i].moduleId())
0110           continue;
0111 
0112         int32_t j = i - 1;
0113         while (j >= 0 and digi_view[j].moduleId() == ::pixelClustering::invalidModuleId)
0114           --j;
0115         if (j < 0 or digi_view[j].moduleId() != digi_view[i].moduleId()) {
0116           // Found a module boundary: count the number of modules in  clus_view[0].moduleStart()
0117           auto loc = alpaka::atomicInc(acc,
0118                                        &clus_view[0].moduleStart(),
0119                                        static_cast<uint32_t>(::pixelClustering::maxNumModules),
0120                                        alpaka::hierarchy::Blocks{});
0121           ALPAKA_ASSERT_ACC(loc < TrackerTraits::numberOfModules);
0122 #ifdef GPU_DEBUG
0123           printf("> New module (no. %d) found at digi %d \n", loc, i);
0124 #endif
0125           clus_view[loc + 1].moduleStart() = i;
0126         }
0127       }
0128     }
0129   };
0130 
0131   template <typename TrackerTraits>
0132   struct FindClus {
0133     // assume that we can cover the whole module with up to 16 blockDimension-wide iterations
0134     static constexpr uint32_t maxIterGPU = 16;
0135 
0136     // this must be larger than maxPixInModule / maxIterGPU, and should be a multiple of the warp size
0137     static constexpr uint32_t maxElementsPerBlock =
0138         cms::alpakatools::round_up_by(TrackerTraits::maxPixInModule / maxIterGPU, 128);
0139 
0140     template <typename TAcc>
0141     ALPAKA_FN_ACC void operator()(const TAcc& acc,
0142                                   SiPixelDigisSoAView digi_view,
0143                                   SiPixelClustersSoAView clus_view,
0144                                   const unsigned int numElements) const {
0145       static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules);
0146 
0147       auto& lastPixel = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
0148 
0149       const uint32_t lastModule = clus_view[0].moduleStart();
0150       for (uint32_t module : cms::alpakatools::independent_groups(acc, lastModule)) {
0151         auto firstPixel = clus_view[1 + module].moduleStart();
0152         uint32_t thisModuleId = digi_view[firstPixel].moduleId();
0153         ALPAKA_ASSERT_ACC(thisModuleId < TrackerTraits::numberOfModules);
0154 
0155 #ifdef GPU_DEBUG
0156         if (thisModuleId % 100 == 1)
0157           if (cms::alpakatools::once_per_block(acc))
0158             printf("start clusterizer for module %4d in block %4d\n",
0159                    thisModuleId,
0160                    alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
0161 #endif
0162 
0163         // find the index of the first pixel not belonging to this module (or invalid)
0164         lastPixel = numElements;
0165         alpaka::syncBlockThreads(acc);
0166 
0167         // skip threads not associated to an existing pixel
0168         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, numElements)) {
0169           auto id = digi_view[i].moduleId();
0170           // skip invalid pixels
0171           if (id == ::pixelClustering::invalidModuleId)
0172             continue;
0173           // find the first pixel in a different module
0174           if (id != thisModuleId) {
0175             alpaka::atomicMin(acc, &lastPixel, i, alpaka::hierarchy::Threads{});
0176             break;
0177           }
0178         }
0179 
0180         using Hist = cms::alpakatools::HistoContainer<uint16_t,
0181                                                       TrackerTraits::clusterBinning,
0182                                                       TrackerTraits::maxPixInModule,
0183                                                       TrackerTraits::clusterBits,
0184                                                       uint16_t>;
0185         auto& hist = alpaka::declareSharedVar<Hist, __COUNTER__>(acc);
0186         auto& ws = alpaka::declareSharedVar<typename Hist::Counter[32], __COUNTER__>(acc);
0187         for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::totbins())) {
0188           hist.off[j] = 0;
0189         }
0190         alpaka::syncBlockThreads(acc);
0191 
0192         ALPAKA_ASSERT_ACC((lastPixel == numElements) or
0193                           ((lastPixel < numElements) and (digi_view[lastPixel].moduleId() != thisModuleId)));
0194         // limit to maxPixInModule  (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
0195         if (cms::alpakatools::once_per_block(acc)) {
0196           if (lastPixel - firstPixel > TrackerTraits::maxPixInModule) {
0197             printf("too many pixels in module %u: %u > %u\n",
0198                    thisModuleId,
0199                    lastPixel - firstPixel,
0200                    TrackerTraits::maxPixInModule);
0201             lastPixel = TrackerTraits::maxPixInModule + firstPixel;
0202           }
0203         }
0204         alpaka::syncBlockThreads(acc);
0205         ALPAKA_ASSERT_ACC(lastPixel - firstPixel <= TrackerTraits::maxPixInModule);
0206 
0207 #ifdef GPU_DEBUG
0208         auto& totGood = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
0209         totGood = 0;
0210         alpaka::syncBlockThreads(acc);
0211 #endif
0212 
0213         // remove duplicate pixels
0214         constexpr bool isPhase2 = std::is_base_of<pixelTopology::Phase2, TrackerTraits>::value;
0215         if constexpr (not isPhase2) {
0216           // packed words array used to store the pixelStatus of each pixel
0217           auto& status = alpaka::declareSharedVar<uint32_t[pixelStatus::size], __COUNTER__>(acc);
0218 
0219           if (lastPixel > 1) {
0220             for (uint32_t i : cms::alpakatools::independent_group_elements(acc, pixelStatus::size)) {
0221               status[i] = 0;
0222             }
0223             alpaka::syncBlockThreads(acc);
0224 
0225             for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) {
0226               // skip invalid pixels
0227               if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
0228                 continue;
0229               pixelStatus::promote(acc, status, digi_view[i].xx(), digi_view[i].yy());
0230             }
0231             alpaka::syncBlockThreads(acc);
0232 
0233             for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) {
0234               // skip invalid pixels
0235               if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
0236                 continue;
0237               if (pixelStatus::isDuplicate(status, digi_view[i].xx(), digi_view[i].yy())) {
0238                 digi_view[i].moduleId() = ::pixelClustering::invalidModuleId;
0239                 digi_view[i].rawIdArr() = 0;
0240               }
0241             }
0242             alpaka::syncBlockThreads(acc);
0243           }
0244         }
0245 
0246         // fill histo
0247         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0248           // skip invalid pixels
0249           if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) {
0250             hist.count(acc, digi_view[i].yy());
0251 #ifdef GPU_DEBUG
0252             alpaka::atomicAdd(acc, &totGood, 1u, alpaka::hierarchy::Blocks{});
0253 #endif
0254           }
0255         }
0256         alpaka::syncBlockThreads(acc);  // FIXME this can be removed
0257         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, 32u)) {
0258           ws[i] = 0;  // used by prefix scan...
0259         }
0260         alpaka::syncBlockThreads(acc);
0261         hist.finalize(acc, ws);
0262         alpaka::syncBlockThreads(acc);
0263 #ifdef GPU_DEBUG
0264         ALPAKA_ASSERT_ACC(hist.size() == totGood);
0265         if (thisModuleId % 100 == 1)
0266           if (cms::alpakatools::once_per_block(acc))
0267             printf("histo size %d\n", hist.size());
0268 #endif
0269         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0270           // skip invalid pixels
0271           if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) {
0272             hist.fill(acc, digi_view[i].yy(), i - firstPixel);
0273           }
0274         }
0275 
0276 #ifdef GPU_DEBUG
0277         // look for anomalous high occupancy
0278         auto& n40 = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
0279         auto& n60 = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
0280         if (cms::alpakatools::once_per_block(acc)) {
0281           n40 = 0;
0282           n60 = 0;
0283         }
0284         alpaka::syncBlockThreads(acc);
0285         for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::nbins())) {
0286           if (hist.size(j) > 60)
0287             alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Blocks{});
0288           if (hist.size(j) > 40)
0289             alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Blocks{});
0290         }
0291         alpaka::syncBlockThreads(acc);
0292         if (cms::alpakatools::once_per_block(acc)) {
0293           if (n60 > 0)
0294             printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
0295           else if (n40 > 0)
0296             printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
0297         }
0298         alpaka::syncBlockThreads(acc);
0299 #endif
0300 
0301         [[maybe_unused]] const uint32_t blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0302         // assume that we can cover the whole module with up to maxIterGPU blockDimension-wide iterations
0303         ALPAKA_ASSERT_ACC((hist.size() / blockDimension) < maxIterGPU);
0304 
0305         // number of elements per thread
0306         constexpr uint32_t maxElements =
0307             cms::alpakatools::requires_single_thread_per_block_v<TAcc> ? maxElementsPerBlock : 1;
0308         ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u] <= maxElements));
0309 
0310         constexpr unsigned int maxIter = maxIterGPU * maxElements;
0311 
0312         // nearest neighbours (nn)
0313         // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
0314         constexpr int maxNeighbours = 10;
0315         uint16_t nn[maxIter][maxNeighbours];
0316         uint8_t nnn[maxIter];  // number of nn
0317         for (uint32_t k = 0; k < maxIter; ++k) {
0318           nnn[k] = 0;
0319         }
0320 
0321         alpaka::syncBlockThreads(acc);  // for hit filling!
0322 
0323         // fill the nearest neighbours
0324         uint32_t k = 0;
0325         for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
0326           ALPAKA_ASSERT_ACC(k < maxIter);
0327           auto p = hist.begin() + j;
0328           auto i = *p + firstPixel;
0329           ALPAKA_ASSERT_ACC(digi_view[i].moduleId() != ::pixelClustering::invalidModuleId);
0330           ALPAKA_ASSERT_ACC(digi_view[i].moduleId() == thisModuleId);  // same module
0331           auto bin = Hist::bin(digi_view[i].yy() + 1);
0332           auto end = hist.end(bin);
0333           ++p;
0334           ALPAKA_ASSERT_ACC(0 == nnn[k]);
0335           for (; p < end; ++p) {
0336             auto m = *p + firstPixel;
0337             ALPAKA_ASSERT_ACC(m != i);
0338             ALPAKA_ASSERT_ACC(int(digi_view[m].yy()) - int(digi_view[i].yy()) >= 0);
0339             ALPAKA_ASSERT_ACC(int(digi_view[m].yy()) - int(digi_view[i].yy()) <= 1);
0340             if (std::abs(int(digi_view[m].xx()) - int(digi_view[i].xx())) <= 1) {
0341               auto l = nnn[k]++;
0342               ALPAKA_ASSERT_ACC(l < maxNeighbours);
0343               nn[k][l] = *p;
0344             }
0345           }
0346           ++k;
0347         }
0348 
0349         // for each pixel, look at all the pixels until the end of the module;
0350         // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
0351         // after the loop, all the pixel in each cluster should have the id equeal to the lowest
0352         // pixel in the cluster ( clus[i] == i ).
0353         bool more = true;
0354         /*
0355           int nloops = 0;
0356           */
0357         while (alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, more)) {
0358           /*
0359             if (nloops % 2 == 0) {
0360               // even iterations of the outer loop
0361             */
0362           more = false;
0363           uint32_t k = 0;
0364           for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
0365             ALPAKA_ASSERT_ACC(k < maxIter);
0366             auto p = hist.begin() + j;
0367             auto i = *p + firstPixel;
0368             for (int kk = 0; kk < nnn[k]; ++kk) {
0369               auto l = nn[k][kk];
0370               auto m = l + firstPixel;
0371               ALPAKA_ASSERT_ACC(m != i);
0372               // FIXME ::Threads ?
0373               auto old = alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Blocks{});
0374               if (old != digi_view[i].clus()) {
0375                 // end the loop only if no changes were applied
0376                 more = true;
0377               }
0378               // FIXME ::Threads ?
0379               alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Blocks{});
0380             }  // neighbours loop
0381             ++k;
0382           }  // pixel loop
0383              /*
0384               // use the outer loop to force a synchronisation
0385             } else {
0386               // odd iterations of the outer loop
0387             */
0388           alpaka::syncBlockThreads(acc);
0389           for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
0390             auto p = hist.begin() + j;
0391             auto i = *p + firstPixel;
0392             auto m = digi_view[i].clus();
0393             while (m != digi_view[m].clus())
0394               m = digi_view[m].clus();
0395             digi_view[i].clus() = m;
0396           }
0397           /*
0398             }
0399             ++nloops;
0400             */
0401         }  // end while
0402 
0403         /*
0404             // check that all threads in the block have executed the same number of iterations
0405             auto& n0 = alpaka::declareSharedVar<int, __COUNTER__>(acc);
0406             if (cms::alpakatools::once_per_block(acc))
0407               n0 = nloops;
0408             alpaka::syncBlockThreads(acc);
0409             ALPAKA_ASSERT_ACC(alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, nloops == n0));
0410             if (thisModuleId % 100 == 1)
0411               if (cms::alpakatools::once_per_block(acc))
0412                 printf("# loops %d\n", nloops);
0413           */
0414 
0415         auto& foundClusters = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
0416         foundClusters = 0;
0417         alpaka::syncBlockThreads(acc);
0418 
0419         // find the number of different clusters, identified by a pixels with clus[i] == i;
0420         // mark these pixels with a negative id.
0421         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0422           // skip invalid pixels
0423           if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
0424             continue;
0425           if (digi_view[i].clus() == static_cast<int>(i)) {
0426             auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{});
0427             digi_view[i].clus() = -(old + 1);
0428           }
0429         }
0430         alpaka::syncBlockThreads(acc);
0431 
0432         // propagate the negative id to all the pixels in the cluster.
0433         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0434           // skip invalid pixels
0435           if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
0436             continue;
0437           if (digi_view[i].clus() >= 0) {
0438             // mark each pixel in a cluster with the same id as the first one
0439             digi_view[i].clus() = digi_view[digi_view[i].clus()].clus();
0440           }
0441         }
0442         alpaka::syncBlockThreads(acc);
0443 
0444         // adjust the cluster id to be a positive value starting from 0
0445         for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0446           if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) {
0447             // mark invalid pixels with an invalid cluster index
0448             digi_view[i].clus() = ::pixelClustering::invalidClusterId;
0449           } else {
0450             digi_view[i].clus() = -digi_view[i].clus() - 1;
0451           }
0452         }
0453         alpaka::syncBlockThreads(acc);
0454 
0455         if (cms::alpakatools::once_per_block(acc)) {
0456           clus_view[thisModuleId].clusInModule() = foundClusters;
0457           clus_view[module].moduleId() = thisModuleId;
0458 #ifdef GPU_DEBUG
0459           if (foundClusters > gMaxHit<TAcc>) {
0460             gMaxHit<TAcc> = foundClusters;
0461             if (foundClusters > 8)
0462               printf("max hit %d in %d\n", foundClusters, thisModuleId);
0463           }
0464           if (thisModuleId % 100 == 1)
0465             printf("%d clusters in module %d\n", foundClusters, thisModuleId);
0466 #endif
0467         }
0468       }  // module loop
0469     }
0470   };
0471 
0472 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering
0473 
0474 #endif  // plugin_SiPixelClusterizer_alpaka_PixelClustering.h