File indexing completed on 2024-04-06 12:26:18
0001 #ifndef RecoLocalTracker_SiPixelClusterizer_alpaka_PixelClustering_h
0002 #define RecoLocalTracker_SiPixelClusterizer_alpaka_PixelClustering_h
0003
0004 #include <cmath>
0005 #include <cstdint>
0006 #include <cstdio>
0007 #include <type_traits>
0008
0009 #include <alpaka/alpaka.hpp>
0010
0011 #include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h"
0012 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
0014 #include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h"
0015 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0016
0017
0018
0019 namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
0020
0021 #ifdef GPU_DEBUG
0022 template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0023 ALPAKA_STATIC_ACC_MEM_GLOBAL uint32_t gMaxHit = 0;
0024 #endif
0025
0026 namespace pixelStatus {
0027
0028 constexpr uint32_t pixelSizeX = pixelTopology::Phase1::numRowsInModule;
0029 constexpr uint32_t pixelSizeY = pixelTopology::Phase1::numColsInModule;
0030
0031
0032 enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 };
0033
0034 constexpr uint32_t bits = 2;
0035 constexpr uint32_t mask = (0x01 << bits) - 1;
0036 constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits;
0037 constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord;
0038
0039 ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getIndex(uint16_t x, uint16_t y) {
0040 return (pixelSizeX * y + x) / valuesPerWord;
0041 }
0042
0043 ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getShift(uint16_t x, uint16_t y) {
0044 return (x % valuesPerWord) * 2;
0045 }
0046
0047 ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Status getStatus(uint32_t const* __restrict__ status,
0048 uint16_t x,
0049 uint16_t y) {
0050 uint32_t index = getIndex(x, y);
0051 uint32_t shift = getShift(x, y);
0052 return Status{(status[index] >> shift) & mask};
0053 }
0054
0055 ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr bool isDuplicate(uint32_t const* __restrict__ status,
0056 uint16_t x,
0057 uint16_t y) {
0058 return getStatus(status, x, y) == kDuplicate;
0059 }
0060
0061
0062
0063
0064
0065
0066
0067
0068 template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0069 ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr void promote(TAcc const& acc,
0070 uint32_t* __restrict__ status,
0071 const uint16_t x,
0072 const uint16_t y) {
0073 uint32_t index = getIndex(x, y);
0074 uint32_t shift = getShift(x, y);
0075 uint32_t old_word = status[index];
0076 uint32_t expected = old_word;
0077 do {
0078 expected = old_word;
0079 Status old_status{(old_word >> shift) & mask};
0080 if (kDuplicate == old_status) {
0081
0082 return;
0083 }
0084 Status new_status = (kEmpty == old_status) ? kFound : kDuplicate;
0085 uint32_t new_word = old_word | (static_cast<uint32_t>(new_status) << shift);
0086 old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Blocks{});
0087 } while (expected != old_word);
0088 }
0089
0090 }
0091
0092 template <typename TrackerTraits>
0093 struct CountModules {
0094 template <typename TAcc>
0095 ALPAKA_FN_ACC void operator()(const TAcc& acc,
0096 SiPixelDigisSoAView digi_view,
0097 SiPixelClustersSoAView clus_view,
0098 const unsigned int numElements) const {
0099
0100 static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules);
0101
0102 #ifdef GPU_DEBUG
0103 if (cms::alpakatools::once_per_grid(acc)) {
0104 printf("Starting to count modules to set module starts:");
0105 }
0106 #endif
0107 for (int32_t i : cms::alpakatools::uniform_elements(acc, numElements)) {
0108 digi_view[i].clus() = i;
0109 if (::pixelClustering::invalidModuleId == digi_view[i].moduleId())
0110 continue;
0111
0112 int32_t j = i - 1;
0113 while (j >= 0 and digi_view[j].moduleId() == ::pixelClustering::invalidModuleId)
0114 --j;
0115 if (j < 0 or digi_view[j].moduleId() != digi_view[i].moduleId()) {
0116
0117 auto loc = alpaka::atomicInc(acc,
0118 &clus_view[0].moduleStart(),
0119 static_cast<uint32_t>(::pixelClustering::maxNumModules),
0120 alpaka::hierarchy::Blocks{});
0121 ALPAKA_ASSERT_ACC(loc < TrackerTraits::numberOfModules);
0122 #ifdef GPU_DEBUG
0123 printf("> New module (no. %d) found at digi %d \n", loc, i);
0124 #endif
0125 clus_view[loc + 1].moduleStart() = i;
0126 }
0127 }
0128 }
0129 };
0130
0131 template <typename TrackerTraits>
0132 struct FindClus {
0133
0134 static constexpr uint32_t maxIterGPU = 16;
0135
0136
0137 static constexpr uint32_t maxElementsPerBlock =
0138 cms::alpakatools::round_up_by(TrackerTraits::maxPixInModule / maxIterGPU, 128);
0139
0140 template <typename TAcc>
0141 ALPAKA_FN_ACC void operator()(const TAcc& acc,
0142 SiPixelDigisSoAView digi_view,
0143 SiPixelClustersSoAView clus_view,
0144 const unsigned int numElements) const {
0145 static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules);
0146
0147 auto& lastPixel = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
0148
0149 const uint32_t lastModule = clus_view[0].moduleStart();
0150 for (uint32_t module : cms::alpakatools::independent_groups(acc, lastModule)) {
0151 auto firstPixel = clus_view[1 + module].moduleStart();
0152 uint32_t thisModuleId = digi_view[firstPixel].moduleId();
0153 ALPAKA_ASSERT_ACC(thisModuleId < TrackerTraits::numberOfModules);
0154
0155 #ifdef GPU_DEBUG
0156 if (thisModuleId % 100 == 1)
0157 if (cms::alpakatools::once_per_block(acc))
0158 printf("start clusterizer for module %4d in block %4d\n",
0159 thisModuleId,
0160 alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
0161 #endif
0162
0163
0164 lastPixel = numElements;
0165 alpaka::syncBlockThreads(acc);
0166
0167
0168 for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, numElements)) {
0169 auto id = digi_view[i].moduleId();
0170
0171 if (id == ::pixelClustering::invalidModuleId)
0172 continue;
0173
0174 if (id != thisModuleId) {
0175 alpaka::atomicMin(acc, &lastPixel, i, alpaka::hierarchy::Threads{});
0176 break;
0177 }
0178 }
0179
0180 using Hist = cms::alpakatools::HistoContainer<uint16_t,
0181 TrackerTraits::clusterBinning,
0182 TrackerTraits::maxPixInModule,
0183 TrackerTraits::clusterBits,
0184 uint16_t>;
0185 auto& hist = alpaka::declareSharedVar<Hist, __COUNTER__>(acc);
0186 auto& ws = alpaka::declareSharedVar<typename Hist::Counter[32], __COUNTER__>(acc);
0187 for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::totbins())) {
0188 hist.off[j] = 0;
0189 }
0190 alpaka::syncBlockThreads(acc);
0191
0192 ALPAKA_ASSERT_ACC((lastPixel == numElements) or
0193 ((lastPixel < numElements) and (digi_view[lastPixel].moduleId() != thisModuleId)));
0194
0195 if (cms::alpakatools::once_per_block(acc)) {
0196 if (lastPixel - firstPixel > TrackerTraits::maxPixInModule) {
0197 printf("too many pixels in module %u: %u > %u\n",
0198 thisModuleId,
0199 lastPixel - firstPixel,
0200 TrackerTraits::maxPixInModule);
0201 lastPixel = TrackerTraits::maxPixInModule + firstPixel;
0202 }
0203 }
0204 alpaka::syncBlockThreads(acc);
0205 ALPAKA_ASSERT_ACC(lastPixel - firstPixel <= TrackerTraits::maxPixInModule);
0206
0207 #ifdef GPU_DEBUG
0208 auto& totGood = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
0209 totGood = 0;
0210 alpaka::syncBlockThreads(acc);
0211 #endif
0212
0213
0214 constexpr bool isPhase2 = std::is_base_of<pixelTopology::Phase2, TrackerTraits>::value;
0215 if constexpr (not isPhase2) {
0216
0217 auto& status = alpaka::declareSharedVar<uint32_t[pixelStatus::size], __COUNTER__>(acc);
0218
0219 if (lastPixel > 1) {
0220 for (uint32_t i : cms::alpakatools::independent_group_elements(acc, pixelStatus::size)) {
0221 status[i] = 0;
0222 }
0223 alpaka::syncBlockThreads(acc);
0224
0225 for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) {
0226
0227 if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
0228 continue;
0229 pixelStatus::promote(acc, status, digi_view[i].xx(), digi_view[i].yy());
0230 }
0231 alpaka::syncBlockThreads(acc);
0232
0233 for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) {
0234
0235 if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
0236 continue;
0237 if (pixelStatus::isDuplicate(status, digi_view[i].xx(), digi_view[i].yy())) {
0238 digi_view[i].moduleId() = ::pixelClustering::invalidModuleId;
0239 digi_view[i].rawIdArr() = 0;
0240 }
0241 }
0242 alpaka::syncBlockThreads(acc);
0243 }
0244 }
0245
0246
0247 for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0248
0249 if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) {
0250 hist.count(acc, digi_view[i].yy());
0251 #ifdef GPU_DEBUG
0252 alpaka::atomicAdd(acc, &totGood, 1u, alpaka::hierarchy::Blocks{});
0253 #endif
0254 }
0255 }
0256 alpaka::syncBlockThreads(acc);
0257 for (uint32_t i : cms::alpakatools::independent_group_elements(acc, 32u)) {
0258 ws[i] = 0;
0259 }
0260 alpaka::syncBlockThreads(acc);
0261 hist.finalize(acc, ws);
0262 alpaka::syncBlockThreads(acc);
0263 #ifdef GPU_DEBUG
0264 ALPAKA_ASSERT_ACC(hist.size() == totGood);
0265 if (thisModuleId % 100 == 1)
0266 if (cms::alpakatools::once_per_block(acc))
0267 printf("histo size %d\n", hist.size());
0268 #endif
0269 for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0270
0271 if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) {
0272 hist.fill(acc, digi_view[i].yy(), i - firstPixel);
0273 }
0274 }
0275
0276 #ifdef GPU_DEBUG
0277
0278 auto& n40 = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
0279 auto& n60 = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
0280 if (cms::alpakatools::once_per_block(acc)) {
0281 n40 = 0;
0282 n60 = 0;
0283 }
0284 alpaka::syncBlockThreads(acc);
0285 for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::nbins())) {
0286 if (hist.size(j) > 60)
0287 alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Blocks{});
0288 if (hist.size(j) > 40)
0289 alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Blocks{});
0290 }
0291 alpaka::syncBlockThreads(acc);
0292 if (cms::alpakatools::once_per_block(acc)) {
0293 if (n60 > 0)
0294 printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
0295 else if (n40 > 0)
0296 printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
0297 }
0298 alpaka::syncBlockThreads(acc);
0299 #endif
0300
0301 [[maybe_unused]] const uint32_t blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0302
0303 ALPAKA_ASSERT_ACC((hist.size() / blockDimension) < maxIterGPU);
0304
0305
0306 constexpr uint32_t maxElements =
0307 cms::alpakatools::requires_single_thread_per_block_v<TAcc> ? maxElementsPerBlock : 1;
0308 ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u] <= maxElements));
0309
0310 constexpr unsigned int maxIter = maxIterGPU * maxElements;
0311
0312
0313
0314 constexpr int maxNeighbours = 10;
0315 uint16_t nn[maxIter][maxNeighbours];
0316 uint8_t nnn[maxIter];
0317 for (uint32_t k = 0; k < maxIter; ++k) {
0318 nnn[k] = 0;
0319 }
0320
0321 alpaka::syncBlockThreads(acc);
0322
0323
0324 uint32_t k = 0;
0325 for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
0326 ALPAKA_ASSERT_ACC(k < maxIter);
0327 auto p = hist.begin() + j;
0328 auto i = *p + firstPixel;
0329 ALPAKA_ASSERT_ACC(digi_view[i].moduleId() != ::pixelClustering::invalidModuleId);
0330 ALPAKA_ASSERT_ACC(digi_view[i].moduleId() == thisModuleId);
0331 auto bin = Hist::bin(digi_view[i].yy() + 1);
0332 auto end = hist.end(bin);
0333 ++p;
0334 ALPAKA_ASSERT_ACC(0 == nnn[k]);
0335 for (; p < end; ++p) {
0336 auto m = *p + firstPixel;
0337 ALPAKA_ASSERT_ACC(m != i);
0338 ALPAKA_ASSERT_ACC(int(digi_view[m].yy()) - int(digi_view[i].yy()) >= 0);
0339 ALPAKA_ASSERT_ACC(int(digi_view[m].yy()) - int(digi_view[i].yy()) <= 1);
0340 if (std::abs(int(digi_view[m].xx()) - int(digi_view[i].xx())) <= 1) {
0341 auto l = nnn[k]++;
0342 ALPAKA_ASSERT_ACC(l < maxNeighbours);
0343 nn[k][l] = *p;
0344 }
0345 }
0346 ++k;
0347 }
0348
0349
0350
0351
0352
0353 bool more = true;
0354
0355
0356
0357 while (alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, more)) {
0358
0359
0360
0361
0362 more = false;
0363 uint32_t k = 0;
0364 for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
0365 ALPAKA_ASSERT_ACC(k < maxIter);
0366 auto p = hist.begin() + j;
0367 auto i = *p + firstPixel;
0368 for (int kk = 0; kk < nnn[k]; ++kk) {
0369 auto l = nn[k][kk];
0370 auto m = l + firstPixel;
0371 ALPAKA_ASSERT_ACC(m != i);
0372
0373 auto old = alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Blocks{});
0374 if (old != digi_view[i].clus()) {
0375
0376 more = true;
0377 }
0378
0379 alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Blocks{});
0380 }
0381 ++k;
0382 }
0383
0384
0385
0386
0387
0388 alpaka::syncBlockThreads(acc);
0389 for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
0390 auto p = hist.begin() + j;
0391 auto i = *p + firstPixel;
0392 auto m = digi_view[i].clus();
0393 while (m != digi_view[m].clus())
0394 m = digi_view[m].clus();
0395 digi_view[i].clus() = m;
0396 }
0397
0398
0399
0400
0401 }
0402
0403
0404
0405
0406
0407
0408
0409
0410
0411
0412
0413
0414
0415 auto& foundClusters = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
0416 foundClusters = 0;
0417 alpaka::syncBlockThreads(acc);
0418
0419
0420
0421 for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0422
0423 if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
0424 continue;
0425 if (digi_view[i].clus() == static_cast<int>(i)) {
0426 auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{});
0427 digi_view[i].clus() = -(old + 1);
0428 }
0429 }
0430 alpaka::syncBlockThreads(acc);
0431
0432
0433 for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0434
0435 if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
0436 continue;
0437 if (digi_view[i].clus() >= 0) {
0438
0439 digi_view[i].clus() = digi_view[digi_view[i].clus()].clus();
0440 }
0441 }
0442 alpaka::syncBlockThreads(acc);
0443
0444
0445 for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
0446 if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) {
0447
0448 digi_view[i].clus() = ::pixelClustering::invalidClusterId;
0449 } else {
0450 digi_view[i].clus() = -digi_view[i].clus() - 1;
0451 }
0452 }
0453 alpaka::syncBlockThreads(acc);
0454
0455 if (cms::alpakatools::once_per_block(acc)) {
0456 clus_view[thisModuleId].clusInModule() = foundClusters;
0457 clus_view[module].moduleId() = thisModuleId;
0458 #ifdef GPU_DEBUG
0459 if (foundClusters > gMaxHit<TAcc>) {
0460 gMaxHit<TAcc> = foundClusters;
0461 if (foundClusters > 8)
0462 printf("max hit %d in %d\n", foundClusters, thisModuleId);
0463 }
0464 if (thisModuleId % 100 == 1)
0465 printf("%d clusters in module %d\n", foundClusters, thisModuleId);
0466 #endif
0467 }
0468 }
0469 }
0470 };
0471
0472 }
0473
0474 #endif