PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu

0001 #include <mutex>
0002
0003 #include "CAHitNtupletGeneratorKernelsImpl.h"
0004
0005 //#define GPU_DEBUG
0006 //#define NTUPLE_DEBUG
0007
0008 template <typename TrackerTraits>
0009 void CAHitNtupletGeneratorKernelsGPU<TrackerTraits>::launchKernels(const HitsConstView &hh,
0010                                                                    TkSoAView &tracks_view,
0011                                                                    cudaStream_t cudaStream) {
0012   using namespace gpuPixelDoublets;
0013   using namespace caHitNtupletGeneratorKernels;
0014
0015   // zero tuples
0016   cms::cuda::launchZero(&(tracks_view.hitIndices()), cudaStream);  //TODO test .data()
0017
0018   int32_t nhits = hh.metadata().size();
0019
0020 #ifdef NTUPLE_DEBUG
0021   std::cout << "start tuple building. N hits " << nhits << std::endl;
0022   if (nhits < 2)
0023     std::cout << "too few hits " << nhits << std::endl;
0024 #endif
0025
0026   //
0027   // applying conbinatoric cleaning such as fishbone at this stage is too expensive
0028   //
0029
0030   auto nthTot = 64;
0031   auto stride = 4;
0032   auto blockSize = nthTot / stride;
0033   auto numberOfBlocks = this->nDoubletBlocks(blockSize);
0034   auto rescale = numberOfBlocks / 65536;
0035   blockSize *= (rescale + 1);
0036   numberOfBlocks = this->nDoubletBlocks(blockSize);
0037   assert(numberOfBlocks < 65536);
0038   assert(blockSize > 0 && 0 == blockSize % 16);
0039   dim3 blks(1, numberOfBlocks, 1);
0040   dim3 thrs(stride, blockSize, 1);
0041
0042   kernel_connect<TrackerTraits>
0043       <<<blks, thrs, 0, cudaStream>>>(this->device_hitTuple_apc_,
0044                                       this->device_hitToTuple_apc_,  // needed only to be reset, ready for next kernel
0045                                       hh,
0046                                       this->device_theCells_.get(),
0047                                       this->device_nCells_,
0048                                       this->device_theCellNeighbors_.get(),
0049                                       this->isOuterHitOfCell_,
0050                                       this->params_.caParams_);
0051
0052   cudaCheck(cudaGetLastError());
0053
0054   // do not run the fishbone if there are hits only in BPIX1
0055   if (nhits > this->isOuterHitOfCell_.offset && this->params_.earlyFishbone_) {
0056     auto nthTot = 128;
0057     auto stride = 16;
0058     auto blockSize = nthTot / stride;
0059     auto numberOfBlocks = (nhits - this->isOuterHitOfCell_.offset + blockSize - 1) / blockSize;
0060     dim3 blks(1, numberOfBlocks, 1);
0061     dim3 thrs(stride, blockSize, 1);
0062     fishbone<TrackerTraits><<<blks, thrs, 0, cudaStream>>>(
0063         hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false);
0064     cudaCheck(cudaGetLastError());
0065   }
0066
0067   blockSize = 64;
0068   numberOfBlocks = (3 * this->params_.caParams_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize;
0069   kernel_find_ntuplets<TrackerTraits><<<numberOfBlocks, blockSize, 0, cudaStream>>>(hh,
0070                                                                                     tracks_view,
0071                                                                                     this->device_theCells_.get(),
0072                                                                                     this->device_nCells_,
0073                                                                                     this->device_theCellTracks_.get(),
0074                                                                                     this->device_hitTuple_apc_,
0075                                                                                     this->params_.caParams_);
0076 #ifdef GPU_DEBUG
0077   cudaDeviceSynchronize();
0078   cudaCheck(cudaGetLastError());
0079 #endif
0080   if (this->params_.doStats_)
0081     kernel_mark_used<TrackerTraits>
0082         <<<numberOfBlocks, blockSize, 0, cudaStream>>>(this->device_theCells_.get(), this->device_nCells_);
0083   cudaCheck(cudaGetLastError());
0084
0085 #ifdef GPU_DEBUG
0086   cudaDeviceSynchronize();
0087   cudaCheck(cudaGetLastError());
0088 #endif
0089
0090   blockSize = 128;
0091   numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize;
0092
0093   cms::cuda::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(this->device_hitTuple_apc_,
0094                                                                         &tracks_view.hitIndices());  //TODO test .data()
0095
0096 #ifdef GPU_DEBUG
0097   cudaDeviceSynchronize();
0098   cudaCheck(cudaGetLastError());
0099 #endif
0100
0101   kernel_fillHitDetIndices<TrackerTraits><<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, hh);
0102   cudaCheck(cudaGetLastError());
0103
0104 #ifdef GPU_DEBUG
0105   cudaDeviceSynchronize();
0106   cudaCheck(cudaGetLastError());
0107 #endif
0108   kernel_fillNLayers<TrackerTraits>
0109       <<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, this->device_hitTuple_apc_);
0110   cudaCheck(cudaGetLastError());
0111
0112 #ifdef GPU_DEBUG
0113   cudaDeviceSynchronize();
0114   cudaCheck(cudaGetLastError());
0115 #endif
0116
0117   // remove duplicates (tracks that share a doublet)
0118   numberOfBlocks = this->nDoubletBlocks(blockSize);
0119
0120   kernel_earlyDuplicateRemover<TrackerTraits><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
0121       this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_);
0122   cudaCheck(cudaGetLastError());
0123 #ifdef GPU_DEBUG
0124   cudaDeviceSynchronize();
0125   cudaCheck(cudaGetLastError());
0126 #endif
0127
0128   blockSize = 128;
0129   numberOfBlocks = (3 * TrackerTraits::maxNumberOfTuples / 4 + blockSize - 1) / blockSize;
0130   kernel_countMultiplicity<TrackerTraits>
0131       <<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, this->device_tupleMultiplicity_.get());
0132   cms::cuda::launchFinalize(this->device_tupleMultiplicity_.get(), cudaStream);
0133   kernel_fillMultiplicity<TrackerTraits>
0134       <<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, this->device_tupleMultiplicity_.get());
0135   cudaCheck(cudaGetLastError());
0136 #ifdef GPU_DEBUG
0137   cudaDeviceSynchronize();
0138   cudaCheck(cudaGetLastError());
0139 #endif
0140
0141   // do not run the fishbone if there are hits only in BPIX1
0142   if (nhits > this->isOuterHitOfCell_.offset && this->params_.lateFishbone_) {
0143     auto nthTot = 128;
0144     auto stride = 16;
0145     auto blockSize = nthTot / stride;
0146     auto numberOfBlocks = (nhits - this->isOuterHitOfCell_.offset + blockSize - 1) / blockSize;
0147     dim3 blks(1, numberOfBlocks, 1);
0148     dim3 thrs(stride, blockSize, 1);
0149     fishbone<TrackerTraits><<<blks, thrs, 0, cudaStream>>>(
0150         hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true);
0151     cudaCheck(cudaGetLastError());
0152   }
0153
0154 #ifdef GPU_DEBUG
0155   cudaDeviceSynchronize();
0156   cudaCheck(cudaGetLastError());
0157 #endif
0158 }
0159
0160 template <typename TrackerTraits>
0161 void CAHitNtupletGeneratorKernelsGPU<TrackerTraits>::buildDoublets(const HitsConstView &hh,
0162                                                                    int32_t offsetBPIX2,
0163                                                                    cudaStream_t stream) {
0164   int32_t nhits = hh.metadata().size();
0165   using namespace gpuPixelDoublets;
0166
0167   using GPUCACell = GPUCACellT<TrackerTraits>;
0168   using OuterHitOfCell = typename GPUCACell::OuterHitOfCell;
0169   using CellNeighbors = typename GPUCACell::CellNeighbors;
0170   using CellTracks = typename GPUCACell::CellTracks;
0171   using OuterHitOfCellContainer = typename GPUCACell::OuterHitOfCellContainer;
0172
0173   this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), offsetBPIX2};
0174
0175 #ifdef NTUPLE_DEBUG
0176   std::cout << "building Doublets out of " << nhits << " Hits" << std::endl;
0177 #endif
0178
0179 #ifdef GPU_DEBUG
0180   cudaDeviceSynchronize();
0181   cudaCheck(cudaGetLastError());
0182 #endif
0183
0184   // in principle we can use "nhits" to heuristically dimension the workspace...
0185   this->device_isOuterHitOfCell_ =
0186       cms::cuda::make_device_unique<OuterHitOfCellContainer[]>(std::max(1, nhits - offsetBPIX2), stream);
0187   assert(this->device_isOuterHitOfCell_.get());
0188
0189   this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), offsetBPIX2};
0190
0191   this->cellStorage_ =
0192       cms::cuda::make_device_unique<unsigned char[]>(TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) +
0193                                                          TrackerTraits::maxNumOfActiveDoublets * sizeof(CellTracks),
0194                                                      stream);
0195   this->device_theCellNeighborsContainer_ = (CellNeighbors *)this->cellStorage_.get();
0196   this->device_theCellTracksContainer_ =
0197       (CellTracks *)(this->cellStorage_.get() + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors));
0198
0199   {
0200     int threadsPerBlock = 128;
0201     // at least one block!
0202     int blocks = (std::max(1, nhits - offsetBPIX2) + threadsPerBlock - 1) / threadsPerBlock;
0203     initDoublets<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(this->isOuterHitOfCell_,
0204                                                                         nhits,
0205                                                                         this->device_theCellNeighbors_.get(),
0206                                                                         this->device_theCellNeighborsContainer_,
0207                                                                         this->device_theCellTracks_.get(),
0208                                                                         this->device_theCellTracksContainer_);
0209     cudaCheck(cudaGetLastError());
0210   }
0211
0212   this->device_theCells_ =
0213       cms::cuda::make_device_unique<GPUCACell[]>(this->params_.caParams_.maxNumberOfDoublets_, stream);
0214
0215 #ifdef GPU_DEBUG
0216   cudaDeviceSynchronize();
0217   cudaCheck(cudaGetLastError());
0218 #endif
0219
0220   if (0 == nhits)
0221     return;  // protect against empty events
0222
0223   // take all layer pairs into account
0224   auto nActualPairs = this->params_.nPairs();
0225
0226   int stride = 4;
0227   int threadsPerBlock = TrackerTraits::getDoubletsFromHistoMaxBlockSize / stride;
0228   int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock;
0229   dim3 blks(1, blocks, 1);
0230   dim3 thrs(stride, threadsPerBlock, 1);
0231
0232   getDoubletsFromHisto<TrackerTraits><<<blks, thrs, 0, stream>>>(this->device_theCells_.get(),
0233                                                                  this->device_nCells_,
0234                                                                  this->device_theCellNeighbors_.get(),
0235                                                                  this->device_theCellTracks_.get(),
0236                                                                  hh,
0237                                                                  this->isOuterHitOfCell_,
0238                                                                  nActualPairs,
0239                                                                  this->params_.caParams_.maxNumberOfDoublets_,
0240                                                                  this->device_cellCuts_.get());
0241   cudaCheck(cudaGetLastError());
0242
0243 #ifdef GPU_DEBUG
0244   cudaDeviceSynchronize();
0245   cudaCheck(cudaGetLastError());
0246 #endif
0247 }
0248
0249 template <typename TrackerTraits>
0250 void CAHitNtupletGeneratorKernelsGPU<TrackerTraits>::classifyTuples(const HitsConstView &hh,
0251                                                                     TkSoAView &tracks_view,
0252                                                                     cudaStream_t cudaStream) {
0253   using namespace caHitNtupletGeneratorKernels;
0254
0255   int32_t nhits = hh.metadata().size();
0256
0257   auto blockSize = 64;
0258
0259   // classify tracks based on kinematics
0260   auto numberOfBlocks = this->nQuadrupletBlocks(blockSize);
0261   kernel_classifyTracks<TrackerTraits>
0262       <<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, this->params_.qualityCuts_);
0263
0264   if (this->params_.lateFishbone_) {
0265     // apply fishbone cleaning to good tracks
0266     numberOfBlocks = this->nDoubletBlocks(blockSize);
0267     kernel_fishboneCleaner<TrackerTraits>
0268         <<<numberOfBlocks, blockSize, 0, cudaStream>>>(this->device_theCells_.get(), this->device_nCells_, tracks_view);
0269     cudaCheck(cudaGetLastError());
0270   }
0271
0272   // mark duplicates (tracks that share a doublet)
0273   numberOfBlocks = this->nDoubletBlocks(blockSize);
0274   kernel_fastDuplicateRemover<TrackerTraits><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
0275       this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_);
0276   cudaCheck(cudaGetLastError());
0277 #ifdef GPU_DEBUG
0278   cudaCheck(cudaDeviceSynchronize());
0279 #endif
0280
0281   if (this->params_.doSharedHitCut_ || this->params_.doStats_) {
0282     // fill hit->track "map"
0283     assert(this->hitToTupleView_.offSize > nhits);
0284     numberOfBlocks = this->nQuadrupletBlocks(blockSize);
0285     kernel_countHitInTracks<TrackerTraits>
0286         <<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, this->device_hitToTuple_.get());  //CHECK
0287     cudaCheck(cudaGetLastError());
0288     assert((this->hitToTupleView_.assoc == this->device_hitToTuple_.get()) &&
0289            (this->hitToTupleView_.offStorage == this->device_hitToTupleStorage_.get()) &&
0290            (this->hitToTupleView_.offSize > 0));
0291     cms::cuda::launchFinalize(this->hitToTupleView_, cudaStream);
0292     cudaCheck(cudaGetLastError());
0293     kernel_fillHitInTracks<TrackerTraits>
0294         <<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, this->device_hitToTuple_.get());
0295     cudaCheck(cudaGetLastError());
0296 #ifdef GPU_DEBUG
0297     cudaCheck(cudaDeviceSynchronize());
0298 #endif
0299   }
0300
0301   if (this->params_.doSharedHitCut_) {
0302     // mark duplicates (tracks that share at least one hit)
0303     numberOfBlocks = (this->hitToTupleView_.offSize + blockSize - 1) / blockSize;
0304
0305     kernel_rejectDuplicate<TrackerTraits><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
0306         tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get());
0307
0308     kernel_sharedHitCleaner<TrackerTraits>
0309         <<<numberOfBlocks, blockSize, 0, cudaStream>>>(hh,
0310                                                        tracks_view,
0311                                                        this->params_.minHitsForSharingCut_,
0312                                                        this->params_.dupPassThrough_,
0313                                                        this->device_hitToTuple_.get());
0314
0315     if (this->params_.useSimpleTripletCleaner_) {
0316       kernel_simpleTripletCleaner<TrackerTraits>
0317           <<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view,
0318                                                          this->params_.minHitsForSharingCut_,
0319                                                          this->params_.dupPassThrough_,
0320                                                          this->device_hitToTuple_.get());
0321     } else {
0322       kernel_tripletCleaner<TrackerTraits>
0323           <<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view,
0324                                                          this->params_.minHitsForSharingCut_,
0325                                                          this->params_.dupPassThrough_,
0326                                                          this->device_hitToTuple_.get());
0327     }
0328     cudaCheck(cudaGetLastError());
0329 #ifdef GPU_DEBUG
0330     cudaCheck(cudaDeviceSynchronize());
0331 #endif
0332   }
0333
0334   if (this->params_.doStats_) {
0335     numberOfBlocks = (std::max(nhits, int(this->params_.caParams_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize;
0336     kernel_checkOverflows<TrackerTraits>
0337         <<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view,
0338                                                        this->device_tupleMultiplicity_.get(),
0339                                                        this->device_hitToTuple_.get(),
0340                                                        this->device_hitTuple_apc_,
0341                                                        this->device_theCells_.get(),
0342                                                        this->device_nCells_,
0343                                                        this->device_theCellNeighbors_.get(),
0344                                                        this->device_theCellTracks_.get(),
0345                                                        this->isOuterHitOfCell_,
0346                                                        nhits,
0347                                                        this->params_.caParams_.maxNumberOfDoublets_,
0348                                                        this->counters_);
0349     cudaCheck(cudaGetLastError());
0350   }
0351
0352   if (this->params_.doStats_) {
0353     // counters (add flag???)
0354     numberOfBlocks = (this->hitToTupleView_.offSize + blockSize - 1) / blockSize;
0355     kernel_doStatsForHitInTracks<TrackerTraits>
0356         <<<numberOfBlocks, blockSize, 0, cudaStream>>>(this->device_hitToTuple_.get(), this->counters_);
0357     cudaCheck(cudaGetLastError());
0358     numberOfBlocks = (3 * TrackerTraits::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize;
0359     kernel_doStatsForTracks<TrackerTraits>
0360         <<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_view, this->counters_);  //why sometimes yes and some no?
0361     cudaCheck(cudaGetLastError());
0362   }
0363 #ifdef GPU_DEBUG
0364   cudaDeviceSynchronize();
0365   cudaCheck(cudaGetLastError());
0366 #endif
0367
0368 #ifdef DUMP_GPU_TK_TUPLES
0369   static std::atomic<int> iev(0);
0370   static std::mutex lock;
0371   {
0372     std::lock_guard<std::mutex> guard(lock);
0373     ++iev;
0374     for (int k = 0; k < 20000; k += 500) {
0375       kernel_print_found_ntuplets<TrackerTraits>
0376           <<<1, 32, 0, cudaStream>>>(hh, tracks_view, this->device_hitToTuple_.get(), k, k + 500, iev);
0377       cudaCheck(cudaStreamSynchronize(cudaStream));
0378     }
0379     kernel_print_found_ntuplets<TrackerTraits>
0380         <<<1, 32, 0, cudaStream>>>(hh, tracks_view, this->device_hitToTuple_.get(), 20000, 1000000, iev);
0381     cudaCheck(cudaStreamSynchronize(cudaStream));
0382   }
0383 #endif
0384 }
0385
0386 template <typename TrackerTraits>
0387 void CAHitNtupletGeneratorKernelsGPU<TrackerTraits>::printCounters(Counters const *counters) {
0388   caHitNtupletGeneratorKernels::kernel_printCounters<<<1, 1>>>(counters);
0389 }
0390
0391 template class CAHitNtupletGeneratorKernelsGPU<pixelTopology::Phase1>;
0392 template class CAHitNtupletGeneratorKernelsGPU<pixelTopology::Phase2>;
0393 template class CAHitNtupletGeneratorKernelsGPU<pixelTopology::HIonPhase1>;