Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-07-26 17:54:25

0001 //
0002 // Original Author: Felice Pantaleo, CERN
0003 //
0004 
0005 //#define NTUPLE_DEBUG
0006 //#define GPU_DEBUG
0007 
0008 #include <cmath>
0009 #include <cstdint>
0010 #include <limits>
0011 
0012 #include <cuda_runtime.h>
0013 
0014 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
0015 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0017 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0018 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
0019 
0020 #include "FWCore/Utilities/interface/isFinite.h"
0021 
0022 #include "CAHitNtupletGeneratorKernels.h"
0023 #include "CAStructures.h"
0024 #include "GPUCACell.h"
0025 #include "gpuFishbone.h"
0026 #include "gpuPixelDoublets.h"
0027 
0028 namespace caHitNtupletGeneratorKernels {
0029 
0030   constexpr uint32_t tkNotFound = std::numeric_limits<uint16_t>::max();
0031   constexpr float maxScore = std::numeric_limits<float>::max();
0032   constexpr float nSigma2 = 25.f;
0033 
0034   //all of these below are mostly to avoid brining around the relative namespace
0035 
0036   template <typename TrackerTraits>
0037   using HitToTuple = caStructures::HitToTupleT<TrackerTraits>;
0038 
0039   template <typename TrackerTraits>
0040   using TupleMultiplicity = caStructures::TupleMultiplicityT<TrackerTraits>;
0041 
0042   template <typename TrackerTraits>
0043   using CellNeighborsVector = caStructures::CellNeighborsVectorT<TrackerTraits>;
0044 
0045   template <typename TrackerTraits>
0046   using CellTracksVector = caStructures::CellTracksVectorT<TrackerTraits>;
0047 
0048   template <typename TrackerTraits>
0049   using OuterHitOfCell = caStructures::OuterHitOfCellT<TrackerTraits>;
0050 
0051   using Quality = pixelTrack::Quality;
0052 
0053   template <typename TrackerTraits>
0054   using TkSoAView = TrackSoAView<TrackerTraits>;
0055 
0056   template <typename TrackerTraits>
0057   using HitContainer = typename TrackSoA<TrackerTraits>::HitContainer;
0058 
0059   template <typename TrackerTraits>
0060   using HitsConstView = typename GPUCACellT<TrackerTraits>::HitsConstView;
0061 
0062   template <typename TrackerTraits>
0063   using QualityCuts = pixelTrack::QualityCutsT<TrackerTraits>;
0064 
0065   template <typename TrackerTraits>
0066   using CAParams = caHitNtupletGenerator::CAParamsT<TrackerTraits>;
0067 
0068   using Counters = caHitNtupletGenerator::Counters;
0069 
0070   template <typename TrackerTraits>
0071   __global__ void kernel_checkOverflows(TkSoAView<TrackerTraits> tracks_view,
0072                                         TupleMultiplicity<TrackerTraits> const *tupleMultiplicity,
0073                                         HitToTuple<TrackerTraits> const *hitToTuple,
0074                                         cms::cuda::AtomicPairCounter *apc,
0075                                         GPUCACellT<TrackerTraits> const *__restrict__ cells,
0076                                         uint32_t const *__restrict__ nCells,
0077                                         CellNeighborsVector<TrackerTraits> const *cellNeighbors,
0078                                         CellTracksVector<TrackerTraits> const *cellTracks,
0079                                         OuterHitOfCell<TrackerTraits> const isOuterHitOfCell,
0080                                         int32_t nHits,
0081                                         uint32_t maxNumberOfDoublets,
0082                                         Counters *counters) {
0083     auto first = threadIdx.x + blockIdx.x * blockDim.x;
0084 
0085     auto &c = *counters;
0086     // counters once per event
0087     if (0 == first) {
0088       atomicAdd(&c.nEvents, 1);
0089       atomicAdd(&c.nHits, nHits);
0090       atomicAdd(&c.nCells, *nCells);
0091       atomicAdd(&c.nTuples, apc->get().m);
0092       atomicAdd(&c.nFitTracks, tupleMultiplicity->size());
0093     }
0094 
0095 #ifdef NTUPLE_DEBUG
0096     if (0 == first) {
0097       printf("number of found cells %d \n found tuples %d with total hits %d out of %d %d\n",
0098              *nCells,
0099              apc->get().m,
0100              apc->get().n,
0101              nHits,
0102              hitToTuple->totOnes());
0103       if (apc->get().m < TrackerTraits::maxNumberOfQuadruplets) {
0104         assert(tracks_view.hitIndices().size(apc->get().m) == 0);
0105         assert(tracks_view.hitIndices().size() == apc->get().n);
0106       }
0107     }
0108 
0109     for (int idx = first, nt = tracks_view.hitIndices().nOnes(); idx < nt; idx += gridDim.x * blockDim.x) {
0110       if (tracks_view.hitIndices().size(idx) > TrackerTraits::maxHitsOnTrack)  // current real limit
0111         printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx));
0112       assert(tracks_view.hitIndices().size(idx) <= TrackerTraits::maxHitsOnTrack);
0113       for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih)
0114         assert(int(*ih) < nHits);
0115     }
0116 #endif
0117 
0118     if (0 == first) {
0119       if (apc->get().m >= TrackerTraits::maxNumberOfQuadruplets)
0120         printf("Tuples overflow\n");
0121       if (*nCells >= maxNumberOfDoublets)
0122         printf("Cells overflow\n");
0123       if (cellNeighbors && cellNeighbors->full())
0124         printf("cellNeighbors overflow %d %d \n", cellNeighbors->capacity(), cellNeighbors->size());
0125       if (cellTracks && cellTracks->full())
0126         printf("cellTracks overflow\n");
0127       if (int(hitToTuple->nOnes()) < nHits)
0128         printf("ERROR hitToTuple  overflow %d %d\n", hitToTuple->nOnes(), nHits);
0129 #ifdef GPU_DEBUG
0130       printf("size of cellNeighbors %d \n cellTracks %d \n hitToTuple %d \n",
0131              cellNeighbors->size(),
0132              cellTracks->size(),
0133              hitToTuple->size());
0134 
0135 #endif
0136     }
0137 
0138     for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
0139       auto const &thisCell = cells[idx];
0140       if (thisCell.hasFishbone() && !thisCell.isKilled())
0141         atomicAdd(&c.nFishCells, 1);
0142       if (thisCell.outerNeighbors().full())  //++tooManyNeighbors[thisCell.theLayerPairId];
0143         printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId());
0144       if (thisCell.tracks().full())  //++tooManyTracks[thisCell.theLayerPairId];
0145         printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId());
0146       if (thisCell.isKilled())
0147         atomicAdd(&c.nKilledCells, 1);
0148       if (!thisCell.unused())
0149         atomicAdd(&c.nEmptyCells, 1);
0150       if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id())))
0151         atomicAdd(&c.nZeroTrackCells, 1);
0152     }
0153 
0154     for (int idx = first, nt = nHits - isOuterHitOfCell.offset; idx < nt; idx += gridDim.x * blockDim.x) {
0155       if (isOuterHitOfCell.container[idx].full())  // ++tooManyOuterHitOfCell;
0156         printf("OuterHitOfCell overflow %d\n", idx);
0157     }
0158   }
0159 
0160   template <typename TrackerTraits>
0161   __global__ void kernel_fishboneCleaner(GPUCACellT<TrackerTraits> const *cells,
0162                                          uint32_t const *__restrict__ nCells,
0163                                          TkSoAView<TrackerTraits> tracks_view) {
0164     constexpr auto reject = pixelTrack::Quality::dup;
0165 
0166     auto first = threadIdx.x + blockIdx.x * blockDim.x;
0167     for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
0168       auto const &thisCell = cells[idx];
0169       if (!thisCell.isKilled())
0170         continue;
0171 
0172       for (auto it : thisCell.tracks())
0173         tracks_view[it].quality() = reject;
0174     }
0175   }
0176 
0177   // remove shorter tracks if sharing a cell
0178   // It does not seem to affect efficiency in any way!
0179   template <typename TrackerTraits>
0180   __global__ void kernel_earlyDuplicateRemover(GPUCACellT<TrackerTraits> const *cells,
0181                                                uint32_t const *__restrict__ nCells,
0182                                                TkSoAView<TrackerTraits> tracks_view,
0183                                                bool dupPassThrough) {
0184     // quality to mark rejected
0185     constexpr auto reject = pixelTrack::Quality::edup;  /// cannot be loose
0186 
0187     assert(nCells);
0188     auto first = threadIdx.x + blockIdx.x * blockDim.x;
0189     for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
0190       auto const &thisCell = cells[idx];
0191 
0192       if (thisCell.tracks().size() < 2)
0193         continue;
0194 
0195       int8_t maxNl = 0;
0196 
0197       // find maxNl
0198       for (auto it : thisCell.tracks()) {
0199         auto nl = tracks_view[it].nLayers();
0200         maxNl = std::max(nl, maxNl);
0201       }
0202 
0203       // if (maxNl<4) continue;
0204       // quad pass through (leave it her for tests)
0205       //  maxNl = std::min(4, maxNl);
0206 
0207       for (auto it : thisCell.tracks()) {
0208         if (tracks_view[it].nLayers() < maxNl)
0209           tracks_view[it].quality() = reject;  //no race:  simple assignment of the same constant
0210       }
0211     }
0212   }
0213 
0214   // assume the above (so, short tracks already removed)
0215   template <typename TrackerTraits>
0216   __global__ void kernel_fastDuplicateRemover(GPUCACellT<TrackerTraits> const *__restrict__ cells,
0217                                               uint32_t const *__restrict__ nCells,
0218                                               TkSoAView<TrackerTraits> tracks_view,
0219                                               bool dupPassThrough) {
0220     // quality to mark rejected
0221     auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
0222     constexpr auto loose = pixelTrack::Quality::loose;
0223 
0224     assert(nCells);
0225 
0226     auto first = threadIdx.x + blockIdx.x * blockDim.x;
0227     for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
0228       auto const &thisCell = cells[idx];
0229       if (thisCell.tracks().size() < 2)
0230         continue;
0231 
0232       float mc = maxScore;
0233       uint16_t im = tkNotFound;
0234 
0235       auto score = [&](auto it) { return std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it)); };
0236 
0237       // full crazy combinatorics
0238       // full crazy combinatorics
0239       int ntr = thisCell.tracks().size();
0240       for (int i = 0; i < ntr - 1; ++i) {
0241         auto it = thisCell.tracks()[i];
0242         auto qi = tracks_view[it].quality();
0243         if (qi <= reject)
0244           continue;
0245         auto opi = tracks_view[it].state()(2);
0246         auto e2opi = tracks_view[it].covariance()(9);
0247         auto cti = tracks_view[it].state()(3);
0248         auto e2cti = tracks_view[it].covariance()(12);
0249         for (auto j = i + 1; j < ntr; ++j) {
0250           auto jt = thisCell.tracks()[j];
0251           auto qj = tracks_view[jt].quality();
0252           if (qj <= reject)
0253             continue;
0254           auto opj = tracks_view[jt].state()(2);
0255           auto ctj = tracks_view[jt].state()(3);
0256           auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti);
0257           if ((cti - ctj) * (cti - ctj) > dct)
0258             continue;
0259           auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi);
0260           if ((opi - opj) * (opi - opj) > dop)
0261             continue;
0262           if ((qj < qi) || (qj == qi && score(it) < score(jt)))
0263             tracks_view[jt].quality() = reject;
0264           else {
0265             tracks_view[it].quality() = reject;
0266             break;
0267           }
0268         }
0269       }
0270 
0271       // find maxQual
0272       auto maxQual = reject;  // no duplicate!
0273       for (auto it : thisCell.tracks()) {
0274         if (tracks_view[it].quality() > maxQual)
0275           maxQual = tracks_view[it].quality();
0276       }
0277 
0278       if (maxQual <= loose)
0279         continue;
0280 
0281       // find min score
0282       for (auto it : thisCell.tracks()) {
0283         if (tracks_view[it].quality() == maxQual && score(it) < mc) {
0284           mc = score(it);
0285           im = it;
0286         }
0287       }
0288 
0289       if (tkNotFound == im)
0290         continue;
0291 
0292       // mark all other duplicates  (not yet, keep it loose)
0293       for (auto it : thisCell.tracks()) {
0294         if (tracks_view[it].quality() > loose && it != im)
0295           tracks_view[it].quality() = loose;  //no race:  simple assignment of the same constant
0296       }
0297     }
0298   }
0299 
0300   template <typename TrackerTraits>
0301   __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1,
0302                                  cms::cuda::AtomicPairCounter *apc2,  // just to zero them,
0303                                  HitsConstView<TrackerTraits> hh,
0304                                  GPUCACellT<TrackerTraits> *cells,
0305                                  uint32_t const *__restrict__ nCells,
0306                                  CellNeighborsVector<TrackerTraits> *cellNeighbors,
0307                                  OuterHitOfCell<TrackerTraits> const isOuterHitOfCell,
0308                                  CAParams<TrackerTraits> params) {
0309     using Cell = GPUCACellT<TrackerTraits>;
0310 
0311     auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y;
0312     auto first = threadIdx.x;
0313     auto stride = blockDim.x;
0314 
0315     if (0 == (firstCellIndex + first)) {
0316       (*apc1) = 0;
0317       (*apc2) = 0;
0318     }  // ready for next kernel
0319 
0320     constexpr uint32_t last_bpix1_detIndex = TrackerTraits::last_bpix1_detIndex;
0321     constexpr uint32_t last_barrel_detIndex = TrackerTraits::last_barrel_detIndex;
0322     for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) {
0323       auto cellIndex = idx;
0324       auto &thisCell = cells[idx];
0325       auto innerHitId = thisCell.inner_hit_id();
0326       if (int(innerHitId) < isOuterHitOfCell.offset)
0327         continue;
0328       int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
0329       auto vi = isOuterHitOfCell[innerHitId].data();
0330 
0331       auto ri = thisCell.inner_r(hh);
0332       auto zi = thisCell.inner_z(hh);
0333 
0334       auto ro = thisCell.outer_r(hh);
0335       auto zo = thisCell.outer_z(hh);
0336       auto isBarrel = thisCell.inner_detIndex(hh) < last_barrel_detIndex;
0337 
0338       for (int j = first; j < numberOfPossibleNeighbors; j += stride) {
0339         auto otherCell = __ldg(vi + j);
0340         auto &oc = cells[otherCell];
0341         auto r1 = oc.inner_r(hh);
0342         auto z1 = oc.inner_z(hh);
0343         bool aligned = Cell::areAlignedRZ(
0344             r1,
0345             z1,
0346             ri,
0347             zi,
0348             ro,
0349             zo,
0350             params.ptmin_,
0351             isBarrel ? params.CAThetaCutBarrel_ : params.CAThetaCutForward_);  // 2.f*thetaCut); // FIXME tune cuts
0352         if (aligned && thisCell.dcaCut(hh,
0353                                        oc,
0354                                        oc.inner_detIndex(hh) < last_bpix1_detIndex ? params.dcaCutInnerTriplet_
0355                                                                                    : params.dcaCutOuterTriplet_,
0356                                        params.hardCurvCut_)) {  // FIXME tune cuts
0357           oc.addOuterNeighbor(cellIndex, *cellNeighbors);
0358           thisCell.setStatusBits(Cell::StatusBit::kUsed);
0359           oc.setStatusBits(Cell::StatusBit::kUsed);
0360         }
0361       }  // loop on inner cells
0362     }    // loop on outer cells
0363   }
0364 
0365   template <typename TrackerTraits>
0366   __global__ void kernel_find_ntuplets(HitsConstView<TrackerTraits> hh,
0367                                        TkSoAView<TrackerTraits> tracks_view,
0368                                        GPUCACellT<TrackerTraits> *__restrict__ cells,
0369                                        uint32_t const *nCells,
0370                                        CellTracksVector<TrackerTraits> *cellTracks,
0371                                        cms::cuda::AtomicPairCounter *apc,
0372                                        CAParams<TrackerTraits> params) {
0373     // recursive: not obvious to widen
0374 
0375     using Cell = GPUCACellT<TrackerTraits>;
0376 
0377     auto first = threadIdx.x + blockIdx.x * blockDim.x;
0378 
0379 #ifdef GPU_DEBUG
0380     if (first == 0)
0381       printf("starting producing ntuplets from %d cells \n", *nCells);
0382 #endif
0383     for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
0384       auto const &thisCell = cells[idx];
0385 
0386       if (thisCell.isKilled())
0387         continue;  // cut by earlyFishbone
0388 
0389       // we require at least three hits...
0390       if (thisCell.outerNeighbors().empty())
0391         continue;
0392 
0393       auto pid = thisCell.layerPairId();
0394       bool doit = params.startingLayerPair(pid);
0395 
0396       constexpr uint32_t maxDepth = TrackerTraits::maxDepth;
0397       if (doit) {
0398         typename Cell::TmpTuple stack;
0399         stack.reset();
0400 
0401         bool bpix1Start = params.startAt0(pid);
0402 
0403         thisCell.template find_ntuplets<maxDepth>(hh,
0404                                                   cells,
0405                                                   *cellTracks,
0406                                                   tracks_view.hitIndices(),
0407                                                   *apc,
0408                                                   tracks_view.quality(),
0409                                                   stack,
0410                                                   params.minHitsPerNtuplet_,
0411                                                   bpix1Start);
0412 
0413         assert(stack.empty());
0414       }
0415     }
0416   }
0417   template <typename TrackerTraits>
0418   __global__ void kernel_mark_used(GPUCACellT<TrackerTraits> *__restrict__ cells, uint32_t const *nCells) {
0419     auto first = threadIdx.x + blockIdx.x * blockDim.x;
0420     using Cell = GPUCACellT<TrackerTraits>;
0421     for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
0422       auto &thisCell = cells[idx];
0423       if (!thisCell.tracks().empty())
0424         thisCell.setStatusBits(Cell::StatusBit::kInTrack);
0425     }
0426   }
0427 
0428   template <typename TrackerTraits>
0429   __global__ void kernel_countMultiplicity(TkSoAView<TrackerTraits> tracks_view,
0430                                            TupleMultiplicity<TrackerTraits> *tupleMultiplicity) {
0431     auto first = blockIdx.x * blockDim.x + threadIdx.x;
0432     for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) {
0433       auto nhits = tracks_view.hitIndices().size(it);
0434       if (nhits < 3)
0435         continue;
0436       if (tracks_view[it].quality() == pixelTrack::Quality::edup)
0437         continue;
0438       assert(tracks_view[it].quality() == pixelTrack::Quality::bad);
0439       if (nhits > TrackerTraits::maxHitsOnTrack)  // current limit
0440         printf("wrong mult %d %d\n", it, nhits);
0441       assert(nhits <= TrackerTraits::maxHitsOnTrack);
0442       tupleMultiplicity->count(nhits);
0443     }
0444   }
0445 
0446   template <typename TrackerTraits>
0447   __global__ void kernel_fillMultiplicity(TkSoAView<TrackerTraits> tracks_view,
0448                                           TupleMultiplicity<TrackerTraits> *tupleMultiplicity) {
0449     auto first = blockIdx.x * blockDim.x + threadIdx.x;
0450     for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) {
0451       auto nhits = tracks_view.hitIndices().size(it);
0452       if (nhits < 3)
0453         continue;
0454       if (tracks_view[it].quality() == pixelTrack::Quality::edup)
0455         continue;
0456       assert(tracks_view[it].quality() == pixelTrack::Quality::bad);
0457       if (nhits > TrackerTraits::maxHitsOnTrack)
0458         printf("wrong mult %d %d\n", it, nhits);
0459       assert(nhits <= TrackerTraits::maxHitsOnTrack);
0460       tupleMultiplicity->fill(nhits, it);
0461     }
0462   }
0463 
0464   ///TODO : why there was quality here?
0465   template <typename TrackerTraits>
0466   __global__ void kernel_classifyTracks(TkSoAView<TrackerTraits> tracks_view, QualityCuts<TrackerTraits> cuts) {
0467     // Quality *__restrict__ quality) {
0468     int first = blockDim.x * blockIdx.x + threadIdx.x;
0469     for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) {
0470       auto nhits = tracks_view.hitIndices().size(it);
0471       if (nhits == 0)
0472         break;  // guard
0473 
0474       // if duplicate: not even fit
0475       if (tracks_view[it].quality() == pixelTrack::Quality::edup)
0476         continue;
0477 
0478       assert(tracks_view[it].quality() == pixelTrack::Quality::bad);
0479 
0480       // mark doublets as bad
0481       if (nhits < 3)
0482         continue;
0483 
0484       // if the fit has any invalid parameters, mark it as bad
0485       bool isNaN = false;
0486       for (int i = 0; i < 5; ++i) {
0487         isNaN |= edm::isNotFinite(tracks_view[it].state()(i));
0488       }
0489       if (isNaN) {
0490 #ifdef NTUPLE_DEBUG
0491         printf("NaN in fit %d size %d chi2 %f\n", it, tracks_view.hitIndices().size(it), tracks_view[it].chi2());
0492 #endif
0493         continue;
0494       }
0495 
0496       tracks_view[it].quality() = pixelTrack::Quality::strict;
0497 
0498       if (cuts.strictCut(tracks_view, it))
0499         continue;
0500 
0501       tracks_view[it].quality() = pixelTrack::Quality::tight;
0502 
0503       if (cuts.isHP(tracks_view, nhits, it))
0504         tracks_view[it].quality() = pixelTrack::Quality::highPurity;
0505     }
0506   }
0507 
0508   template <typename TrackerTraits>
0509   __global__ void kernel_doStatsForTracks(TkSoAView<TrackerTraits> tracks_view, Counters *counters) {
0510     int first = blockDim.x * blockIdx.x + threadIdx.x;
0511     for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0512       if (tracks_view.hitIndices().size(idx) == 0)
0513         break;  //guard
0514       if (tracks_view[idx].quality() < pixelTrack::Quality::loose)
0515         continue;
0516       atomicAdd(&(counters->nLooseTracks), 1);
0517       if (tracks_view[idx].quality() < pixelTrack::Quality::strict)
0518         continue;
0519       atomicAdd(&(counters->nGoodTracks), 1);
0520     }
0521   }
0522 
0523   template <typename TrackerTraits>
0524   __global__ void kernel_countHitInTracks(TkSoAView<TrackerTraits> tracks_view, HitToTuple<TrackerTraits> *hitToTuple) {
0525     int first = blockDim.x * blockIdx.x + threadIdx.x;
0526     for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0527       if (tracks_view.hitIndices().size(idx) == 0)
0528         break;  // guard
0529       for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h)
0530         hitToTuple->count(*h);
0531     }
0532   }
0533 
0534   template <typename TrackerTraits>
0535   __global__ void kernel_fillHitInTracks(TkSoAView<TrackerTraits> tracks_view, HitToTuple<TrackerTraits> *hitToTuple) {
0536     int first = blockDim.x * blockIdx.x + threadIdx.x;
0537     for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0538       if (tracks_view.hitIndices().size(idx) == 0)
0539         break;  // guard
0540       for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h)
0541         hitToTuple->fill(*h, idx);
0542     }
0543   }
0544 
0545   template <typename TrackerTraits>
0546   __global__ void kernel_fillHitDetIndices(TkSoAView<TrackerTraits> tracks_view, HitsConstView<TrackerTraits> hh) {
0547     int first = blockDim.x * blockIdx.x + threadIdx.x;
0548     // copy offsets
0549     for (int idx = first, ntot = tracks_view.hitIndices().totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0550       tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx];
0551     }
0552     // fill hit indices
0553     auto nhits = hh.nHits();
0554 
0555     for (int idx = first, ntot = tracks_view.hitIndices().size(); idx < ntot; idx += gridDim.x * blockDim.x) {
0556       assert(tracks_view.hitIndices().content[idx] < nhits);
0557       tracks_view.detIndices().content[idx] = hh[tracks_view.hitIndices().content[idx]].detectorIndex();
0558     }
0559   }
0560 
0561   template <typename TrackerTraits>
0562   __global__ void kernel_fillNLayers(TkSoAView<TrackerTraits> tracks_view, cms::cuda::AtomicPairCounter *apc) {
0563     auto first = blockIdx.x * blockDim.x + threadIdx.x;
0564     // clamp the number of tracks to the capacity of the SoA
0565     auto ntracks = std::min<int>(apc->get().m, tracks_view.metadata().size() - 1);
0566     if (0 == first)
0567       tracks_view.nTracks() = ntracks;
0568     for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
0569       auto nHits = TracksUtilities<TrackerTraits>::nHits(tracks_view, idx);
0570       assert(nHits >= 3);
0571       tracks_view[idx].nLayers() = TracksUtilities<TrackerTraits>::computeNumberOfLayers(tracks_view, idx);
0572     }
0573   }
0574 
0575   template <typename TrackerTraits>
0576   __global__ void kernel_doStatsForHitInTracks(HitToTuple<TrackerTraits> const *__restrict__ hitToTuple,
0577                                                Counters *counters) {
0578     auto &c = *counters;
0579     int first = blockDim.x * blockIdx.x + threadIdx.x;
0580     for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0581       if (hitToTuple->size(idx) == 0)
0582         continue;  // SHALL NOT BE break
0583       atomicAdd(&c.nUsedHits, 1);
0584       if (hitToTuple->size(idx) > 1)
0585         atomicAdd(&c.nDupHits, 1);
0586     }
0587   }
0588 
0589   template <typename TrackerTraits>
0590   __global__ void kernel_countSharedHit(int *__restrict__ nshared,
0591                                         HitContainer<TrackerTraits> const *__restrict__ ptuples,
0592                                         Quality const *__restrict__ quality,
0593                                         HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) {
0594     constexpr auto loose = pixelTrack::Quality::loose;
0595 
0596     auto &hitToTuple = *phitToTuple;
0597     auto const &foundNtuplets = *ptuples;
0598 
0599     int first = blockDim.x * blockIdx.x + threadIdx.x;
0600     for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0601       if (hitToTuple.size(idx) < 2)
0602         continue;
0603 
0604       int nt = 0;
0605 
0606       // count "good" tracks
0607       for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
0608         if (quality[*it] < loose)
0609           continue;
0610         ++nt;
0611       }
0612 
0613       if (nt < 2)
0614         continue;
0615 
0616       // now mark  each track triplet as sharing a hit
0617       for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
0618         if (foundNtuplets.size(*it) > 3)
0619           continue;
0620         atomicAdd(&nshared[*it], 1);
0621       }
0622 
0623     }  //  hit loop
0624   }
0625 
0626   template <typename TrackerTraits>
0627   __global__ void kernel_markSharedHit(int const *__restrict__ nshared,
0628                                        HitContainer<TrackerTraits> const *__restrict__ tuples,
0629                                        Quality *__restrict__ quality,
0630                                        bool dupPassThrough) {
0631     // constexpr auto bad = pixelTrack::Quality::bad;
0632     constexpr auto dup = pixelTrack::Quality::dup;
0633     constexpr auto loose = pixelTrack::Quality::loose;
0634     // constexpr auto strict = pixelTrack::Quality::strict;
0635 
0636     // quality to mark rejected
0637     auto const reject = dupPassThrough ? loose : dup;
0638 
0639     int first = blockDim.x * blockIdx.x + threadIdx.x;
0640     for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0641       if (tuples->size(idx) == 0)
0642         break;  //guard
0643       if (quality[idx] <= reject)
0644         continue;
0645       if (nshared[idx] > 2)
0646         quality[idx] = reject;
0647     }
0648   }
0649 
0650   // mostly for very forward triplets.....
0651   template <typename TrackerTraits>
0652   __global__ void kernel_rejectDuplicate(TkSoAView<TrackerTraits> tracks_view,
0653                                          uint16_t nmin,
0654                                          bool dupPassThrough,
0655                                          HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) {
0656     // quality to mark rejected
0657     auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
0658 
0659     auto &hitToTuple = *phitToTuple;
0660 
0661     int first = blockDim.x * blockIdx.x + threadIdx.x;
0662     for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0663       if (hitToTuple.size(idx) < 2)
0664         continue;
0665 
0666       auto score = [&](auto it, auto nl) { return std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it)); };
0667 
0668       // full combinatorics
0669       for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) {
0670         auto const it = *ip;
0671         auto qi = tracks_view[it].quality();
0672         if (qi <= reject)
0673           continue;
0674         auto opi = tracks_view[it].state()(2);
0675         auto e2opi = tracks_view[it].covariance()(9);
0676         auto cti = tracks_view[it].state()(3);
0677         auto e2cti = tracks_view[it].covariance()(12);
0678         auto nli = tracks_view[it].nLayers();
0679         for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) {
0680           auto const jt = *jp;
0681           auto qj = tracks_view[jt].quality();
0682           if (qj <= reject)
0683             continue;
0684           auto opj = tracks_view[jt].state()(2);
0685           auto ctj = tracks_view[jt].state()(3);
0686           auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti);
0687           if ((cti - ctj) * (cti - ctj) > dct)
0688             continue;
0689           auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi);
0690           if ((opi - opj) * (opi - opj) > dop)
0691             continue;
0692           auto nlj = tracks_view[jt].nLayers();
0693           if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj)))))
0694             tracks_view[jt].quality() = reject;
0695           else {
0696             tracks_view[it].quality() = reject;
0697             break;
0698           }
0699         }
0700       }
0701     }
0702   }
0703 
0704   template <typename TrackerTraits>
0705   __global__ void kernel_sharedHitCleaner(HitsConstView<TrackerTraits> hh,
0706                                           TkSoAView<TrackerTraits> tracks_view,
0707                                           int nmin,
0708                                           bool dupPassThrough,
0709                                           HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) {
0710     // quality to mark rejected
0711     auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
0712     // quality of longest track
0713     auto const longTqual = pixelTrack::Quality::highPurity;
0714 
0715     auto &hitToTuple = *phitToTuple;
0716 
0717     int l1end = hh.hitsLayerStart()[1];
0718 
0719     int first = blockDim.x * blockIdx.x + threadIdx.x;
0720     for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0721       if (hitToTuple.size(idx) < 2)
0722         continue;
0723 
0724       int8_t maxNl = 0;
0725 
0726       // find maxNl
0727       for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
0728         if (tracks_view[*it].quality() < longTqual)
0729           continue;
0730         // if (tracks_view[*it].nHits()==3) continue;
0731         auto nl = tracks_view[*it].nLayers();
0732         maxNl = std::max(nl, maxNl);
0733       }
0734 
0735       if (maxNl < 4)
0736         continue;
0737 
0738       // quad pass through (leave for tests)
0739       // maxNl = std::min(4, maxNl);
0740 
0741       // kill all tracks shorter than maxHl (only triplets???
0742       for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
0743         auto nl = tracks_view[*it].nLayers();
0744 
0745         //checking if shared hit is on bpix1 and if the tuple is short enough
0746         if (idx < l1end and nl > nmin)
0747           continue;
0748 
0749         if (nl < maxNl && tracks_view[*it].quality() > reject)
0750           tracks_view[*it].quality() = reject;
0751       }
0752     }
0753   }
0754 
0755   template <typename TrackerTraits>
0756   __global__ void kernel_tripletCleaner(TkSoAView<TrackerTraits> tracks_view,
0757                                         uint16_t nmin,
0758                                         bool dupPassThrough,
0759                                         HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) {
0760     // quality to mark rejected
0761     auto const reject = pixelTrack::Quality::loose;
0762     /// min quality of good
0763     auto const good = pixelTrack::Quality::strict;
0764 
0765     auto &hitToTuple = *phitToTuple;
0766 
0767     int first = blockDim.x * blockIdx.x + threadIdx.x;
0768     for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0769       if (hitToTuple.size(idx) < 2)
0770         continue;
0771 
0772       float mc = maxScore;
0773       uint16_t im = tkNotFound;
0774       bool onlyTriplets = true;
0775 
0776       // check if only triplets
0777       for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
0778         if (tracks_view[*it].quality() <= good)
0779           continue;
0780         onlyTriplets &= TracksUtilities<TrackerTraits>::isTriplet(tracks_view, *it);
0781         if (!onlyTriplets)
0782           break;
0783       }
0784 
0785       // only triplets
0786       if (!onlyTriplets)
0787         continue;
0788 
0789       // for triplets choose best tip!  (should we first find best quality???)
0790       for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
0791         auto const it = *ip;
0792         if (tracks_view[it].quality() >= good && std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it)) < mc) {
0793           mc = std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it));
0794           im = it;
0795         }
0796       }
0797 
0798       if (tkNotFound == im)
0799         continue;
0800 
0801       // mark worse ambiguities
0802       for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
0803         auto const it = *ip;
0804         if (tracks_view[it].quality() > reject && it != im)
0805           tracks_view[it].quality() = reject;  //no race:  simple assignment of the same constant
0806       }
0807 
0808     }  // loop over hits
0809   }
0810 
0811   template <typename TrackerTraits>
0812   __global__ void kernel_simpleTripletCleaner(TkSoAView<TrackerTraits> tracks_view,
0813                                               uint16_t nmin,
0814                                               bool dupPassThrough,
0815                                               HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) {
0816     // quality to mark rejected
0817     auto const reject = pixelTrack::Quality::loose;
0818     /// min quality of good
0819     auto const good = pixelTrack::Quality::loose;
0820 
0821     auto &hitToTuple = *phitToTuple;
0822 
0823     int first = blockDim.x * blockIdx.x + threadIdx.x;
0824     for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0825       if (hitToTuple.size(idx) < 2)
0826         continue;
0827 
0828       float mc = maxScore;
0829       uint16_t im = tkNotFound;
0830 
0831       // choose best tip!  (should we first find best quality???)
0832       for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
0833         auto const it = *ip;
0834         if (tracks_view[it].quality() >= good && std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it)) < mc) {
0835           mc = std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it));
0836           im = it;
0837         }
0838       }
0839 
0840       if (tkNotFound == im)
0841         continue;
0842 
0843       // mark worse ambiguities
0844       for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
0845         auto const it = *ip;
0846         if (tracks_view[it].quality() > reject && TracksUtilities<TrackerTraits>::isTriplet(tracks_view, it) &&
0847             it != im)
0848           tracks_view[it].quality() = reject;  //no race:  simple assignment of the same constant
0849       }
0850 
0851     }  // loop over hits
0852   }
0853 
0854   template <typename TrackerTraits>
0855   __global__ void kernel_print_found_ntuplets(HitsConstView<TrackerTraits> hh,
0856                                               TkSoAView<TrackerTraits> tracks_view,
0857                                               HitToTuple<TrackerTraits> const *__restrict__ phitToTuple,
0858                                               int32_t firstPrint,
0859                                               int32_t lastPrint,
0860                                               int iev) {
0861     constexpr auto loose = pixelTrack::Quality::loose;
0862 
0863     int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x;
0864     for (int i = first, np = std::min(lastPrint, tracks_view.hitIndices().nOnes()); i < np;
0865          i += blockDim.x * gridDim.x) {
0866       auto nh = tracks_view.hitIndices().size(i);
0867       if (nh < 3)
0868         continue;
0869       if (tracks_view[i].quality() < loose)
0870         continue;
0871       printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n",
0872              10000 * iev + i,
0873              int(tracks_view[i].quality()),
0874              nh,
0875              tracks_view[i].nLayers(),
0876              TracksUtilities<TrackerTraits>::charge(tracks_view, i),
0877              tracks_view[i].pt(),
0878              tracks_view[i].eta(),
0879              TracksUtilities<TrackerTraits>::phi(tracks_view, i),
0880              TracksUtilities<TrackerTraits>::tip(tracks_view, i),
0881              TracksUtilities<TrackerTraits>::zip(tracks_view, i),
0882              tracks_view[i].chi2(),
0883              hh[*tracks_view.hitIndices().begin(i)].zGlobal(),
0884              hh[*(tracks_view.hitIndices().begin(i) + 1)].zGlobal(),
0885              hh[*(tracks_view.hitIndices().begin(i) + 2)].zGlobal(),
0886              nh > 3 ? hh[int(*(tracks_view.hitIndices().begin(i) + 3))].zGlobal() : 0,
0887              nh > 4 ? hh[int(*(tracks_view.hitIndices().begin(i) + 4))].zGlobal() : 0,
0888              nh > 5 ? hh[int(*(tracks_view.hitIndices().begin(i) + 5))].zGlobal() : 0,
0889              nh > 6 ? hh[int(*(tracks_view.hitIndices().begin(i) + nh - 1))].zGlobal() : 0);
0890     }
0891   }
0892 
0893   __global__ void kernel_printCounters(Counters const *counters) {
0894     auto const &c = *counters;
0895     printf(
0896         "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks  |  nLooseTracks  |  nGoodTracks | nUsedHits | "
0897         "nDupHits | "
0898         "nFishCells | "
0899         "nKilledCells | "
0900         "nUsedCells | nZeroTrackCells ||\n");
0901     printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
0902            c.nEvents,
0903            c.nHits,
0904            c.nCells,
0905            c.nTuples,
0906            c.nFitTracks,
0907            c.nLooseTracks,
0908            c.nGoodTracks,
0909            c.nUsedHits,
0910            c.nDupHits,
0911            c.nFishCells,
0912            c.nKilledCells,
0913            c.nEmptyCells,
0914            c.nZeroTrackCells);
0915     printf(
0916         "Counters Norm %lld ||  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.3f|  %.3f|  %.3f|  %.3f||\n",
0917         c.nEvents,
0918         c.nHits / double(c.nEvents),
0919         c.nCells / double(c.nEvents),
0920         c.nTuples / double(c.nEvents),
0921         c.nFitTracks / double(c.nEvents),
0922         c.nLooseTracks / double(c.nEvents),
0923         c.nGoodTracks / double(c.nEvents),
0924         c.nUsedHits / double(c.nEvents),
0925         c.nDupHits / double(c.nEvents),
0926         c.nFishCells / double(c.nCells),
0927         c.nKilledCells / double(c.nCells),
0928         c.nEmptyCells / double(c.nCells),
0929         c.nZeroTrackCells / double(c.nCells));
0930   }
0931 
0932 }  // namespace caHitNtupletGeneratorKernels