Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2022-04-04 00:15:50

0001 #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h"
0002 
0003 #include <mutex>
0004 
0005 namespace {
0006   // CUDA atomics are NOT atomic on the CPU, so statistics updates are protected with a mutex
0007   // while waiting for a more general solution (including multiple devices) to be proposed and implemented.
       // In this file the mutex is taken in classifyTuples around the calls made when params_.doStats_ is set.
0008   std::mutex lock_stat;
0009 }  // namespace
0010 
// Explicit specialization for CAHitNtupletGeneratorKernelsCPU.
// Forwards the `counters` pointer unchanged to kernel_printCounters; nothing else is done here.
0011 template <>
0012 void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) {
0013   kernel_printCounters(counters);
0014 }
0015 
// Explicit specialization for CAHitNtupletGeneratorKernelsCPU.
// Allocates the workspace used for doublet building (outer-hit-of-cell containers, one byte buffer
// shared by the cell-neighbors and cell-tracks containers, and the cell array), calls
// gpuPixelDoublets::initDoublets, and then, unless `hh` contains no hits, calls
// gpuPixelDoublets::getDoubletsFromHisto with the number of layer pairs selected from params_.
//
// hh     : hits for this call; read through nHits(), offsetBPIX2() and view().
// stream : not used by this body; it only appears in the commented-out Traits allocations.
0016 template <>
0017 void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
0018   auto nhits = hh.nHits();
0019 
0020 #ifdef NTUPLE_DEBUG
0021   std::cout << "building Doublets out of " << nhits << " Hits. BPIX2 offset is " << hh.offsetBPIX2() << std::endl;
0022 #endif
0023 
0024   // use "nhits" to heuristically dimension the workspace
0025 
0026   // no need to use the Traits allocations, since we know this is being compiled for the CPU
0027   //device_isOuterHitOfCell_ = Traits::template make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
       // std::max(1U, nhits) keeps the allocation non-empty even when there are no hits
0028   device_isOuterHitOfCell_ = std::make_unique<GPUCACell::OuterHitOfCellContainer[]>(std::max(1U, nhits));
0029   assert(device_isOuterHitOfCell_.get());
       // non-owning handle: the raw container pointer paired with the BPIX2 offset taken from the hits
0030   isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};
0031 
       // a single byte buffer sized for maxNumOfActiveDoublets CellNeighbors followed by
       // maxNumOfActiveDoublets CellTracks
0032   auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
0033                          caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks);
0034   // no need to use the Traits allocations, since we know this is being compiled for the CPU
0035   //cellStorage_ = Traits::template make_unique<unsigned char[]>(cellStorageSize, stream);
0036   cellStorage_ = std::make_unique<unsigned char[]>(cellStorageSize);
       // the neighbors container starts at the beginning of the buffer ...
0037   device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get();
       // ... and the tracks container starts right after the region reserved for the neighbors
0038   device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets *
0039                                                                                       sizeof(GPUCACell::CellNeighbors));
0040 
       // device_theCellNeighbors_ and device_theCellTracks_ are not allocated in this function
       // NOTE(review): presumably they are set up before buildDoublets is called - confirm against the caller
0041   gpuPixelDoublets::initDoublets(isOuterHitOfCell_,
0042                                  nhits,
0043                                  device_theCellNeighbors_.get(),
0044                                  device_theCellNeighborsContainer_,
0045                                  device_theCellTracks_.get(),
0046                                  device_theCellTracksContainer_);
0047 
0048   // no need to use the Traits allocations, since we know this is being compiled for the CPU
0049   //device_theCells_ = Traits::template make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_, stream);
       // the cell array is allocated before the empty-event check below, so it exists even on early return
0050   device_theCells_ = std::make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_);
0051   if (0 == nhits)
0052     return;  // protect against empty events
0053 
0054   // take all layer pairs into account
0055   auto nActualPairs = gpuPixelDoublets::nPairs;
0056   if (not params_.includeJumpingForwardDoublets_) {
0057     // exclude forward "jumping" layer pairs
0058     nActualPairs = gpuPixelDoublets::nPairsForTriplets;
0059   }
       // this check comes last, so when minHitsPerNtuplet_ > 3 it overrides the choice made above
0060   if (params_.minHitsPerNtuplet_ > 3) {
0061     // for quadruplets, exclude all "jumping" layer pairs
0062     nActualPairs = gpuPixelDoublets::nPairsForQuadruplets;
0063   }
0064 
0065   assert(nActualPairs <= gpuPixelDoublets::nPairs);
0066   gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(),
0067                                          device_nCells_,
0068                                          device_theCellNeighbors_.get(),
0069                                          device_theCellTracks_.get(),
0070                                          hh.view(),
0071                                          isOuterHitOfCell_,
0072                                          nActualPairs,
0073                                          params_.idealConditions_,
0074                                          params_.doClusterCut_,
0075                                          params_.doZ0Cut_,
0076                                          params_.doPtCut_,
0077                                          params_.maxNumberOfDoublets_);
0078 }
0079 
// Explicit specialization for CAHitNtupletGeneratorKernelsCPU.
// Runs, in this fixed order, the steps that fill `tracks_d` from the cells held by this object:
// zero the tuple container, kernel_connect, optional early fishbone, kernel_find_ntuplets,
// optional kernel_mark_used, finalizeBulk, fill of the detector indices and layer counts,
// early duplicate removal, tuple-multiplicity count/finalize/fill, optional late fishbone.
// Each step is an ordinary function call on the CPU; the order of the calls is significant.
//
// hh         : hits for this call; read through nHits() and view().
// tracks_d   : output track SoA; its hitIndices, detIndices and quality data are used below.
// cudaStream : only forwarded to cms::cuda::launchZero and cms::cuda::launchFinalize.
0080 template <>
0081 void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
0082   auto *tuples_d = &tracks_d->hitIndices;
0083   auto *detId_d = &tracks_d->detIndices;
0084   auto *quality_d = tracks_d->qualityData();
0085 
       // NOTE(review): tracks_d has already been used above, so this assert does not guard a null tracks_d
0086   assert(tuples_d && quality_d);
0087 
0088   // zero tuples
0089   cms::cuda::launchZero(tuples_d, cudaStream);
0090 
0091   auto nhits = hh.nHits();
0092 
0093   // std::cout << "N hits " << nhits << std::endl;
0094   // if (nhits<2) std::cout << "too few hits " << nhits << std::endl;
0095 
0096   //
0097   // applying combinatorial cleaning such as fishbone at this stage is too expensive
0098   //
0099 
0100   kernel_connect(device_hitTuple_apc_,
0101                  device_hitToTuple_apc_,  // needed only to be reset, ready for next kernel
0102                  hh.view(),
0103                  device_theCells_.get(),
0104                  device_nCells_,
0105                  device_theCellNeighbors_.get(),
0106                  isOuterHitOfCell_,
0107                  params_.hardCurvCut_,
0108                  params_.ptmin_,
0109                  params_.CAThetaCutBarrel_,
0110                  params_.CAThetaCutForward_,
0111                  params_.dcaCutInnerTriplet_,
0112                  params_.dcaCutOuterTriplet_);
0113 
       // early fishbone: requires more than one hit; the last argument is false here and true in the late call below
       // NOTE(review): the meaning of that flag is defined by gpuPixelDoublets::fishbone - confirm there
0114   if (nhits > 1 && params_.earlyFishbone_) {
0115     gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false);
0116   }
0117 
0118   kernel_find_ntuplets(hh.view(),
0119                        device_theCells_.get(),
0120                        device_nCells_,
0121                        device_theCellTracks_.get(),
0122                        tuples_d,
0123                        device_hitTuple_apc_,
0124                        quality_d,
0125                        params_.minHitsPerNtuplet_);
       // marking of cells is done only when statistics are requested
0126   if (params_.doStats_)
0127     kernel_mark_used(device_theCells_.get(), device_nCells_);
0128 
0129   cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d);
0130 
       // the detector indices and layer counts are filled after the tuple container has been finalized
0131   kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d);
0132   kernel_fillNLayers(tracks_d, device_hitTuple_apc_);
0133 
0134   // remove duplicates (tracks that share a doublet)
0135   kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_);
0136 
       // tuple multiplicity is built in three steps: count, finalize, fill
0137   kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
0138   cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream);
0139   kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
0140 
       // late fishbone: same call as the early one, but with the last argument set to true
0141   if (nhits > 1 && params_.lateFishbone_) {
0142     gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true);
0143   }
0144 }
0145 
// Explicit specialization for CAHitNtupletGeneratorKernelsCPU.
// Classifies the tracks in `tracks_d` and applies the cleaning steps enabled in params_:
// kernel_classifyTracks, optional fishbone cleaner, fast duplicate remover, optional
// hit->track map filling, optional shared-hit based cleaning, and optional statistics/overflow checks.
//
// hh         : hits for this call; read through nHits() and view().
// tracks_d   : track SoA; its hitIndices are read and its quality data is passed to the cleaning kernels.
// cudaStream : only forwarded to cms::cuda::launchFinalize.
0146 template <>
0147 void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
       // nhits is only used below as an argument to kernel_checkOverflows
0148   int32_t nhits = hh.nHits();
0149 
0150   auto const *tuples_d = &tracks_d->hitIndices;
0151   auto *quality_d = tracks_d->qualityData();
0152 
0153   // classify tracks based on kinematics
0154   kernel_classifyTracks(tuples_d, tracks_d, params_.cuts_, quality_d);
0155 
0156   if (params_.lateFishbone_) {
0157     // apply fishbone cleaning to good tracks
0158     kernel_fishboneCleaner(device_theCells_.get(), device_nCells_, quality_d);
0159   }
0160 
0161   // remove duplicates (tracks that share a doublet)
0162   kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_);
0163 
0164   // fill hit->track "map"
       // built only when it is needed: either for the shared-hit cut or for the statistics below
0165   if (params_.doSharedHitCut_ || params_.doStats_) {
0166     kernel_countHitInTracks(tuples_d, quality_d, device_hitToTuple_.get());
         // NOTE(review): finalize is given hitToTupleView_ while count/fill are given device_hitToTuple_.get();
         // presumably the view refers to the same container - confirm where hitToTupleView_ is set
0167     cms::cuda::launchFinalize(hitToTupleView_, cudaStream);
0168     kernel_fillHitInTracks(tuples_d, quality_d, device_hitToTuple_.get());
0169   }
0170 
0171   // remove duplicates (tracks that share at least one hit)
0172   if (params_.doSharedHitCut_) {
0173     kernel_rejectDuplicate(
0174         tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
0175 
0176     kernel_sharedHitCleaner(hh.view(),
0177                             tracks_d,
0178                             quality_d,
0179                             params_.minHitsForSharingCut_,
0180                             params_.dupPassThrough_,
0181                             device_hitToTuple_.get());
         // exactly one of the two triplet cleaners runs; both receive the same arguments
0182     if (params_.useSimpleTripletCleaner_) {
0183       kernel_simpleTripletCleaner(
0184           tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
0185     } else {
0186       kernel_tripletCleaner(
0187           tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get());
0188     }
0189   }
0190 
       // overflow checks are performed only when statistics are requested, with lock_stat held
0191   if (params_.doStats_) {
0192     std::lock_guard guard(lock_stat);
0193     kernel_checkOverflows(tuples_d,
0194                           device_tupleMultiplicity_.get(),
0195                           device_hitToTuple_.get(),
0196                           device_hitTuple_apc_,
0197                           device_theCells_.get(),
0198                           device_nCells_,
0199                           device_theCellNeighbors_.get(),
0200                           device_theCellTracks_.get(),
0201                           isOuterHitOfCell_,
0202                           nhits,
0203                           params_.maxNumberOfDoublets_,
0204                           counters_);
0205   }
0206 
       // lock_stat is acquired a second time here: the guard above is released at the end of its block
0207   if (params_.doStats_) {
0208     // counters (add flag???)
0209     std::lock_guard guard(lock_stat);
0210     kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_);
0211     kernel_doStatsForTracks(tuples_d, quality_d, counters_);
0212   }
0213 
0214 #ifdef DUMP_GPU_TK_TUPLES
       // debug dump: the static counter is incremented and the print call is made under a function-local mutex
       // NOTE(review): the literals 0 and 1000000 are presumably a range of tuples to print - confirm in
       // kernel_print_found_ntuplets
0215   static std::atomic<int> iev(0);
0216   static std::mutex lock;
0217   {
0218     std::lock_guard<std::mutex> guard(lock);
0219     ++iev;
0220     kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 0, 1000000, iev);
0221   }
0222 #endif
0223 }