Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-11-14 04:15:52

0001 // C++ headers
0002 #ifdef DUMP_GPU_TK_TUPLES
0003 #include <mutex>
0004 #endif
0005 
0006 // Alpaka headers
0007 #include <alpaka/alpaka.hpp>
0008 
0009 // CMSSW headers
0010 #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
0011 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0012 #include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
0013 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
0014 
0015 // local headers
0016 #include "CAFishbone.h"
0017 #include "CAHitNtupletGeneratorKernels.h"
0018 #include "CAHitNtupletGeneratorKernelsImpl.h"
0019 
0020 //#define GPU_DEBUG
0021 //#define NTUPLE_DEBUG
0022 
0023 namespace ALPAKA_ACCELERATOR_NAMESPACE {
0024 
0025   template <typename TrackerTraits>
0026   CAHitNtupletGeneratorKernels<TrackerTraits>::CAHitNtupletGeneratorKernels(Params const &params,
0027                                                                             uint32_t nhits,
0028                                                                             uint32_t offsetBPIX2,
0029                                                                             Queue &queue)
0030       : m_params(params),
0031         //////////////////////////////////////////////////////////
0032         // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER)
0033         //////////////////////////////////////////////////////////
0034         counters_{cms::alpakatools::make_device_buffer<Counters>(queue)},
0035 
0036         // workspace
0037         device_hitToTuple_{cms::alpakatools::make_device_buffer<HitToTuple>(queue)},
0038         device_hitToTupleStorage_{
0039             cms::alpakatools::make_device_buffer<typename HitToTuple::Counter[]>(queue, nhits + 1)},
0040         device_tupleMultiplicity_{cms::alpakatools::make_device_buffer<TupleMultiplicity>(queue)},
0041 
0042         // NB: In legacy, device_theCells_ and device_isOuterHitOfCell_ were allocated inside buildDoublets
0043         device_theCells_{
0044             cms::alpakatools::make_device_buffer<CACell[]>(queue, m_params.caParams_.maxNumberOfDoublets_)},
0045         // in principle we can use "nhits" to heuristically dimension the workspace...
0046         device_isOuterHitOfCell_{
0047             cms::alpakatools::make_device_buffer<OuterHitOfCellContainer[]>(queue, std::max(1u, nhits - offsetBPIX2))},
0048         isOuterHitOfCell_{cms::alpakatools::make_device_buffer<OuterHitOfCell>(queue)},
0049 
0050         device_theCellNeighbors_{cms::alpakatools::make_device_buffer<CellNeighborsVector>(queue)},
0051         device_theCellTracks_{cms::alpakatools::make_device_buffer<CellTracksVector>(queue)},
0052         // NB: In legacy, cellStorage_ was allocated inside buildDoublets
0053         cellStorage_{cms::alpakatools::make_device_buffer<unsigned char[]>(
0054             queue,
0055             TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) +
0056                 TrackerTraits::maxNumOfActiveDoublets * sizeof(CellTracks))},
0057         device_cellCuts_{cms::alpakatools::make_device_buffer<CellCuts>(queue)},
0058         device_theCellNeighborsContainer_{reinterpret_cast<CellNeighbors *>(cellStorage_.data())},
0059         device_theCellTracksContainer_{reinterpret_cast<CellTracks *>(
0060             cellStorage_.data() + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors))},
0061 
0062         // NB: In legacy, device_storage_ was allocated inside allocateOnGPU
0063         device_storage_{
0064             cms::alpakatools::make_device_buffer<cms::alpakatools::AtomicPairCounter::DoubleWord[]>(queue, 3u)},
0065         device_hitTuple_apc_{reinterpret_cast<cms::alpakatools::AtomicPairCounter *>(device_storage_.data())},
0066         device_hitToTuple_apc_{reinterpret_cast<cms::alpakatools::AtomicPairCounter *>(device_storage_.data() + 1)},
0067         device_nCells_{
0068             cms::alpakatools::make_device_view(queue, *reinterpret_cast<uint32_t *>(device_storage_.data() + 2))} {
0069 #ifdef GPU_DEBUG
0070     std::cout << "Allocation for tuple building. N hits " << nhits << std::endl;
0071 #endif
0072 
0073     alpaka::memset(queue, counters_, 0);
0074     alpaka::memset(queue, device_nCells_, 0);
0075     alpaka::memset(queue, cellStorage_, 0);
0076 
0077     auto cellCuts_h = cms::alpakatools::make_host_view(m_params.cellCuts_);
0078     alpaka::memcpy(queue, device_cellCuts_, cellCuts_h);
0079 
0080     [[maybe_unused]] TupleMultiplicity *tupleMultiplicityDeviceData = device_tupleMultiplicity_.data();
0081     using TM = cms::alpakatools::OneToManyAssocRandomAccess<typename TrackerTraits::tindex_type,
0082                                                             TrackerTraits::maxHitsOnTrack + 1,
0083                                                             TrackerTraits::maxNumberOfTuples>;
0084     TM *tm = device_tupleMultiplicity_.data();
0085     TM::template launchZero<Acc1D>(tm, queue);
0086     TupleMultiplicity::template launchZero<Acc1D>(tupleMultiplicityDeviceData, queue);
0087 
0088     device_hitToTupleView_.assoc = device_hitToTuple_.data();
0089     device_hitToTupleView_.offStorage = device_hitToTupleStorage_.data();
0090     device_hitToTupleView_.offSize = nhits + 1;
0091 
0092     HitToTuple::template launchZero<Acc1D>(device_hitToTupleView_, queue);
0093 #ifdef GPU_DEBUG
0094     std::cout << "Allocations for CAHitNtupletGeneratorKernels: done!" << std::endl;
0095 #endif
0096   }
0097 
0098   template <typename TrackerTraits>
0099   void CAHitNtupletGeneratorKernels<TrackerTraits>::launchKernels(const HitsConstView &hh,
0100                                                                   uint32_t offsetBPIX2,
0101                                                                   TkSoAView &tracks_view,
0102                                                                   Queue &queue) {
0103     using namespace caPixelDoublets;
0104     using namespace caHitNtupletGeneratorKernels;
0105 
0106     // zero tuples
0107     HitContainer::template launchZero<Acc1D>(&(tracks_view.hitIndices()), queue);
0108 
0109     uint32_t nhits = hh.metadata().size();
0110 
0111 #ifdef NTUPLE_DEBUG
0112     std::cout << "start tuple building. N hits " << nhits << std::endl;
0113     if (nhits < 2)
0114       std::cout << "too few hits " << nhits << std::endl;
0115 #endif
0116 
0117     //
0118     // applying combinatoric cleaning such as fishbone at this stage is too expensive
0119     //
0120 
0121     const auto nthTot = 64;
0122     const auto stride = 4;
0123     auto blockSize = nthTot / stride;
0124     auto numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize);
0125     const auto rescale = numberOfBlocks / 65536;
0126     blockSize *= (rescale + 1);
0127     numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize);
0128     assert(numberOfBlocks < 65536);
0129     assert(blockSize > 0 && 0 == blockSize % 16);
0130     const Vec2D blks{numberOfBlocks, 1u};
0131     const Vec2D thrs{blockSize, stride};
0132     const auto kernelConnectWorkDiv = cms::alpakatools::make_workdiv<Acc2D>(blks, thrs);
0133 
0134     alpaka::exec<Acc2D>(queue,
0135                         kernelConnectWorkDiv,
0136                         Kernel_connect<TrackerTraits>{},
0137                         this->device_hitTuple_apc_,
0138                         this->device_hitToTuple_apc_,  // needed only to be reset, ready for next kernel
0139                         hh,
0140                         this->device_theCells_.data(),
0141                         this->device_nCells_.data(),
0142                         this->device_theCellNeighbors_.data(),
0143                         this->isOuterHitOfCell_.data(),
0144                         this->m_params.caParams_);
0145 
0146     // do not run the fishbone if there are hits only in BPIX1
0147     if (this->m_params.earlyFishbone_ and nhits > offsetBPIX2) {
0148       const auto nthTot = 128;
0149       const auto stride = 16;
0150       const auto blockSize = nthTot / stride;
0151       const auto numberOfBlocks = cms::alpakatools::divide_up_by(nhits - offsetBPIX2, blockSize);
0152       const Vec2D blks{numberOfBlocks, 1u};
0153       const Vec2D thrs{blockSize, stride};
0154       const auto fishboneWorkDiv = cms::alpakatools::make_workdiv<Acc2D>(blks, thrs);
0155       alpaka::exec<Acc2D>(queue,
0156                           fishboneWorkDiv,
0157                           CAFishbone<TrackerTraits>{},
0158                           hh,
0159                           this->device_theCells_.data(),
0160                           this->device_nCells_.data(),
0161                           this->isOuterHitOfCell_.data(),
0162                           nhits,
0163                           false);
0164     }
0165     blockSize = 64;
0166     numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize);
0167     auto workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0168     alpaka::exec<Acc1D>(queue,
0169                         workDiv1D,
0170                         Kernel_find_ntuplets<TrackerTraits>{},
0171                         hh,
0172                         tracks_view,
0173                         this->device_theCells_.data(),
0174                         this->device_nCells_.data(),
0175                         this->device_theCellTracks_.data(),
0176                         this->device_hitTuple_apc_,
0177                         this->m_params.caParams_);
0178 #ifdef GPU_DEBUG
0179     alpaka::wait(queue);
0180 #endif
0181 
0182     if (this->m_params.doStats_)
0183       alpaka::exec<Acc1D>(queue,
0184                           workDiv1D,
0185                           Kernel_mark_used<TrackerTraits>{},
0186                           this->device_theCells_.data(),
0187                           this->device_nCells_.data());
0188 
0189 #ifdef GPU_DEBUG
0190     alpaka::wait(queue);
0191 #endif
0192 
0193     blockSize = 128;
0194     numberOfBlocks = cms::alpakatools::divide_up_by(HitContainer{}.totOnes(), blockSize);
0195     workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0196 
0197     alpaka::exec<Acc1D>(
0198         queue, workDiv1D, typename HitContainer::finalizeBulk{}, this->device_hitTuple_apc_, &tracks_view.hitIndices());
0199 
0200 #ifdef GPU_DEBUG
0201     alpaka::wait(queue);
0202 #endif
0203 
0204     alpaka::exec<Acc1D>(queue, workDiv1D, Kernel_fillHitDetIndices<TrackerTraits>{}, tracks_view, hh);
0205 
0206 #ifdef GPU_DEBUG
0207     alpaka::wait(queue);
0208 #endif
0209     alpaka::exec<Acc1D>(queue, workDiv1D, Kernel_fillNLayers<TrackerTraits>{}, tracks_view, this->device_hitTuple_apc_);
0210 
0211 #ifdef GPU_DEBUG
0212     alpaka::wait(queue);
0213 #endif
0214 
0215     // remove duplicates (tracks that share a doublet)
0216     numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize);
0217     workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0218 
0219     alpaka::exec<Acc1D>(queue,
0220                         workDiv1D,
0221                         Kernel_earlyDuplicateRemover<TrackerTraits>{},
0222                         this->device_theCells_.data(),
0223                         this->device_nCells_.data(),
0224                         tracks_view,
0225                         this->m_params.dupPassThrough_);
0226 #ifdef GPU_DEBUG
0227     alpaka::wait(queue);
0228 #endif
0229 
0230     blockSize = 128;
0231     numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfTuples / 4, blockSize);
0232     workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0233 
0234     alpaka::exec<Acc1D>(queue,
0235                         workDiv1D,
0236                         Kernel_countMultiplicity<TrackerTraits>{},
0237                         tracks_view,
0238                         this->device_tupleMultiplicity_.data());
0239     TupleMultiplicity::template launchFinalize<Acc1D>(this->device_tupleMultiplicity_.data(), queue);
0240 
0241     workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0242     alpaka::exec<Acc1D>(
0243         queue, workDiv1D, Kernel_fillMultiplicity<TrackerTraits>{}, tracks_view, this->device_tupleMultiplicity_.data());
0244 #ifdef GPU_DEBUG
0245     alpaka::wait(queue);
0246 #endif
0247     // do not run the fishbone if there are hits only in BPIX1
0248     if (this->m_params.lateFishbone_ and nhits > offsetBPIX2) {
0249       const auto nthTot = 128;
0250       const auto stride = 16;
0251       const auto blockSize = nthTot / stride;
0252       const auto numberOfBlocks = cms::alpakatools::divide_up_by(nhits - offsetBPIX2, blockSize);
0253       const Vec2D blks{numberOfBlocks, 1u};
0254       const Vec2D thrs{blockSize, stride};
0255       const auto workDiv2D = cms::alpakatools::make_workdiv<Acc2D>(blks, thrs);
0256 
0257       alpaka::exec<Acc2D>(queue,
0258                           workDiv2D,
0259                           CAFishbone<TrackerTraits>{},
0260                           hh,
0261                           this->device_theCells_.data(),
0262                           this->device_nCells_.data(),
0263                           this->isOuterHitOfCell_.data(),
0264                           nhits,
0265                           true);
0266     }
0267 
0268 #ifdef GPU_DEBUG
0269     alpaka::wait(queue);
0270 #endif
0271   }
0272 
0273   template <typename TrackerTraits>
0274   void CAHitNtupletGeneratorKernels<TrackerTraits>::buildDoublets(const HitsConstView &hh,
0275                                                                   uint32_t offsetBPIX2,
0276                                                                   Queue &queue) {
0277     using namespace caPixelDoublets;
0278     using CACell = CACellT<TrackerTraits>;
0279     using OuterHitOfCell = typename CACell::OuterHitOfCell;
0280     using CellNeighbors = typename CACell::CellNeighbors;
0281     using CellTracks = typename CACell::CellTracks;
0282     using OuterHitOfCellContainer = typename CACell::OuterHitOfCellContainer;
0283 
0284     auto nhits = hh.metadata().size();
0285 #ifdef NTUPLE_DEBUG
0286     std::cout << "building Doublets out of " << nhits << " Hits" << std::endl;
0287 #endif
0288 
0289 #ifdef GPU_DEBUG
0290     alpaka::wait(queue);
0291 #endif
0292 
0293     // in principle we can use "nhits" to heuristically dimension the workspace...
0294     ALPAKA_ASSERT_ACC(this->device_isOuterHitOfCell_.data());
0295 
0296     alpaka::exec<Acc1D>(
0297         queue,
0298         cms::alpakatools::make_workdiv<Acc1D>(1, 1),
0299         [] ALPAKA_FN_ACC(Acc1D const &acc,
0300                          OuterHitOfCell *isOuterHitOfCell,
0301                          OuterHitOfCellContainer *container,
0302                          int32_t const *offset) {
0303           // this code runs on the device
0304           isOuterHitOfCell->container = container;
0305           isOuterHitOfCell->offset = *offset;
0306         },
0307         this->isOuterHitOfCell_.data(),
0308         this->device_isOuterHitOfCell_.data(),
0309         &hh.offsetBPIX2());
0310 
0311     {
0312       int threadsPerBlock = 128;
0313       // at least one block!
0314       int blocks = std::max(1u, cms::alpakatools::divide_up_by(nhits - offsetBPIX2, threadsPerBlock));
0315       const auto workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(blocks, threadsPerBlock);
0316 
0317       alpaka::exec<Acc1D>(queue,
0318                           workDiv1D,
0319                           InitDoublets<TrackerTraits>{},
0320                           this->isOuterHitOfCell_.data(),
0321                           nhits,
0322                           this->device_theCellNeighbors_.data(),
0323                           this->device_theCellNeighborsContainer_,
0324                           this->device_theCellTracks_.data(),
0325                           this->device_theCellTracksContainer_);
0326     }
0327 
0328 #ifdef GPU_DEBUG
0329     alpaka::wait(queue);
0330 #endif
0331 
0332     if (0 == nhits)
0333       return;  // protect against empty events
0334 
0335     // take all layer pairs into account
0336     auto nActualPairs = this->m_params.nPairs();
0337 
0338     const int stride = 4;
0339     const int threadsPerBlock = TrackerTraits::getDoubletsFromHistoMaxBlockSize / stride;
0340     int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock;
0341     const Vec2D blks{blocks, 1u};
0342     const Vec2D thrs{threadsPerBlock, stride};
0343     const auto workDiv2D = cms::alpakatools::make_workdiv<Acc2D>(blks, thrs);
0344 
0345     alpaka::exec<Acc2D>(queue,
0346                         workDiv2D,
0347                         GetDoubletsFromHisto<TrackerTraits>{},
0348                         this->device_theCells_.data(),
0349                         this->device_nCells_.data(),
0350                         this->device_theCellNeighbors_.data(),
0351                         this->device_theCellTracks_.data(),
0352                         hh,
0353                         this->isOuterHitOfCell_.data(),
0354                         nActualPairs,
0355                         this->m_params.caParams_.maxNumberOfDoublets_,
0356                         this->m_params.cellCuts_);
0357 
0358 #ifdef GPU_DEBUG
0359     alpaka::wait(queue);
0360 #endif
0361   }
0362 
0363   template <typename TrackerTraits>
0364   void CAHitNtupletGeneratorKernels<TrackerTraits>::classifyTuples(const HitsConstView &hh,
0365                                                                    TkSoAView &tracks_view,
0366                                                                    Queue &queue) {
0367     using namespace caHitNtupletGeneratorKernels;
0368 
0369     uint32_t nhits = hh.metadata().size();
0370 
0371     auto blockSize = 64;
0372 
0373     // classify tracks based on kinematics
0374     auto numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfQuadruplets / 4, blockSize);
0375     auto workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0376     alpaka::exec<Acc1D>(
0377         queue, workDiv1D, Kernel_classifyTracks<TrackerTraits>{}, tracks_view, this->m_params.qualityCuts_);
0378 
0379     if (this->m_params.lateFishbone_) {
0380       // apply fishbone cleaning to good tracks
0381       numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize);
0382       workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0383       alpaka::exec<Acc1D>(queue,
0384                           workDiv1D,
0385                           Kernel_fishboneCleaner<TrackerTraits>{},
0386                           this->device_theCells_.data(),
0387                           this->device_nCells_.data(),
0388                           tracks_view);
0389     }
0390 
0391     // mark duplicates (tracks that share a doublet)
0392     numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize);
0393     workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0394     alpaka::exec<Acc1D>(queue,
0395                         workDiv1D,
0396                         Kernel_fastDuplicateRemover<TrackerTraits>{},
0397                         this->device_theCells_.data(),
0398                         this->device_nCells_.data(),
0399                         tracks_view,
0400                         this->m_params.dupPassThrough_);
0401 #ifdef GPU_DEBUG
0402     alpaka::wait(queue);
0403 #endif
0404 
0405     if (this->m_params.doSharedHitCut_ || this->m_params.doStats_) {
0406       // fill hit->track "map"
0407       numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfQuadruplets / 4, blockSize);
0408       workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0409       alpaka::exec<Acc1D>(queue,
0410                           workDiv1D,
0411                           Kernel_countHitInTracks<TrackerTraits>{},
0412                           tracks_view,
0413                           this->device_hitToTuple_.data());  //CHECK
0414 
0415       HitToTuple::template launchFinalize<Acc1D>(this->device_hitToTupleView_, queue);
0416       alpaka::exec<Acc1D>(
0417           queue, workDiv1D, Kernel_fillHitInTracks<TrackerTraits>{}, tracks_view, this->device_hitToTuple_.data());
0418 #ifdef GPU_DEBUG
0419       alpaka::wait(queue);
0420 #endif
0421     }
0422 
0423     if (this->m_params.doSharedHitCut_) {
0424       // mark duplicates (tracks that share at least one hit)
0425       numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfQuadruplets / 4,
0426                                                       blockSize);  // TODO: Check if correct
0427       workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0428       alpaka::exec<Acc1D>(queue,
0429                           workDiv1D,
0430                           Kernel_rejectDuplicate<TrackerTraits>{},
0431                           tracks_view,
0432                           this->m_params.minHitsForSharingCut_,
0433                           this->m_params.dupPassThrough_,
0434                           this->device_hitToTuple_.data());
0435 
0436       alpaka::exec<Acc1D>(queue,
0437                           workDiv1D,
0438                           Kernel_sharedHitCleaner<TrackerTraits>{},
0439                           hh,
0440                           tracks_view,
0441                           this->m_params.minHitsForSharingCut_,
0442                           this->m_params.dupPassThrough_,
0443                           this->device_hitToTuple_.data());
0444 
0445       if (this->m_params.useSimpleTripletCleaner_) {
0446         // (typename HitToTuple{}::capacity(),
0447         numberOfBlocks = cms::alpakatools::divide_up_by(HitToTuple{}.capacity(), blockSize);
0448         workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0449         alpaka::exec<Acc1D>(queue,
0450                             workDiv1D,
0451                             Kernel_simpleTripletCleaner<TrackerTraits>{},
0452                             tracks_view,
0453                             this->m_params.minHitsForSharingCut_,
0454                             this->m_params.dupPassThrough_,
0455                             this->device_hitToTuple_.data());
0456       } else {
0457         numberOfBlocks = cms::alpakatools::divide_up_by(HitToTuple{}.capacity(), blockSize);
0458         workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0459         alpaka::exec<Acc1D>(queue,
0460                             workDiv1D,
0461                             Kernel_tripletCleaner<TrackerTraits>{},
0462                             tracks_view,
0463                             this->m_params.minHitsForSharingCut_,
0464                             this->m_params.dupPassThrough_,
0465                             this->device_hitToTuple_.data());
0466       }
0467 #ifdef GPU_DEBUG
0468       alpaka::wait(queue);
0469 #endif
0470     }
0471 
0472     if (this->m_params.doStats_) {
0473       numberOfBlocks =
0474           cms::alpakatools::divide_up_by(std::max(nhits, m_params.caParams_.maxNumberOfDoublets_), blockSize);
0475       workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0476 
0477       alpaka::exec<Acc1D>(queue,
0478                           workDiv1D,
0479                           Kernel_checkOverflows<TrackerTraits>{},
0480                           tracks_view,
0481                           this->device_tupleMultiplicity_.data(),
0482                           this->device_hitToTuple_.data(),
0483                           this->device_hitTuple_apc_,
0484                           this->device_theCells_.data(),
0485                           this->device_nCells_.data(),
0486                           this->device_theCellNeighbors_.data(),
0487                           this->device_theCellTracks_.data(),
0488                           this->isOuterHitOfCell_.data(),
0489                           nhits,
0490                           this->m_params.caParams_.maxNumberOfDoublets_,
0491                           this->counters_.data());
0492     }
0493 
0494     if (this->m_params.doStats_) {
0495       // counters (add flag???)
0496 
0497       numberOfBlocks = cms::alpakatools::divide_up_by(HitToTuple{}.capacity(), blockSize);
0498       workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0499       alpaka::exec<Acc1D>(queue,
0500                           workDiv1D,
0501                           Kernel_doStatsForHitInTracks<TrackerTraits>{},
0502                           this->device_hitToTuple_.data(),
0503                           this->counters_.data());
0504 
0505       numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfQuadruplets / 4, blockSize);
0506       workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
0507       alpaka::exec<Acc1D>(
0508           queue, workDiv1D, Kernel_doStatsForTracks<TrackerTraits>{}, tracks_view, this->counters_.data());
0509     }
0510 #ifdef GPU_DEBUG
0511     alpaka::wait(queue);
0512 #endif
0513 
0514 #ifdef DUMP_GPU_TK_TUPLES
0515     static std::atomic<int> iev(0);
0516     static std::mutex lock;
0517     workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(1u, 32u);
0518     {
0519       std::lock_guard<std::mutex> guard(lock);
0520       ++iev;
0521       for (int k = 0; k < 20000; k += 500) {
0522         alpaka::exec<Acc1D>(queue,
0523                             workDiv1D,
0524                             Kernel_print_found_ntuplets<TrackerTraits>{},
0525                             hh,
0526                             tracks_view,
0527                             this->device_hitToTuple_.data(),
0528                             k,
0529                             k + 500,
0530                             iev);
0531         alpaka::wait(queue);
0532       }
0533       alpaka::exec<Acc1D>(queue,
0534                           workDiv1D,
0535                           Kernel_print_found_ntuplets<TrackerTraits>{},
0536                           hh,
0537                           tracks_view,
0538                           this->device_hitToTuple_.data(),
0539                           20000,
0540                           1000000,
0541                           iev);
0542 
0543       alpaka::wait(queue);
0544     }
0545 #endif
0546   }
0547 
0548   /* This will make sense when we will be able to run this once per job in Alpaka
0549 
0550   template <typename TrackerTraits>
0551   void CAHitNtupletGeneratorKernels<TrackerTraits>::printCounters() {
0552     auto workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(1,1);
0553     alpaka::exec<Acc1D>(queue_, workDiv1D, Kernel_printCounters{}, this->counters_.data());
0554   }
0555   */
0556 
0557   template class CAHitNtupletGeneratorKernels<pixelTopology::Phase1>;
0558   template class CAHitNtupletGeneratorKernels<pixelTopology::Phase2>;
0559   template class CAHitNtupletGeneratorKernels<pixelTopology::HIonPhase1>;
0560 
0561 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE