Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:28:33

0001 //
0002 // Original Author: Felice Pantaleo, CERN
0003 //
0004 
0005 //#define NTUPLE_DEBUG
0006 //#define GPU_DEBUG
0007 
0008 #include <cmath>
0009 #include <cstdint>
0010 #include <limits>
0011 
0012 #include <cuda_runtime.h>
0013 
0014 #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
0015 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0017 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0018 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
0019 
0020 #include "CAHitNtupletGeneratorKernels.h"
0021 #include "CAStructures.h"
0022 #include "GPUCACell.h"
0023 #include "gpuFishbone.h"
0024 #include "gpuPixelDoublets.h"
0025 
0026 namespace caHitNtupletGeneratorKernels {
0027 
0028   constexpr uint32_t tkNotFound = std::numeric_limits<uint16_t>::max();
0029   constexpr float maxScore = std::numeric_limits<float>::max();
0030   constexpr float nSigma2 = 25.f;
0031 
0032   //all of these below are mostly to avoid brining around the relative namespace
0033 
0034   template <typename TrackerTraits>
0035   using HitToTuple = caStructures::HitToTupleT<TrackerTraits>;
0036 
0037   template <typename TrackerTraits>
0038   using TupleMultiplicity = caStructures::TupleMultiplicityT<TrackerTraits>;
0039 
0040   template <typename TrackerTraits>
0041   using CellNeighborsVector = caStructures::CellNeighborsVectorT<TrackerTraits>;
0042 
0043   template <typename TrackerTraits>
0044   using CellTracksVector = caStructures::CellTracksVectorT<TrackerTraits>;
0045 
0046   template <typename TrackerTraits>
0047   using OuterHitOfCell = caStructures::OuterHitOfCellT<TrackerTraits>;
0048 
0049   using Quality = pixelTrack::Quality;
0050 
0051   template <typename TrackerTraits>
0052   using TkSoAView = TrackSoAView<TrackerTraits>;
0053 
0054   template <typename TrackerTraits>
0055   using HitContainer = typename TrackSoA<TrackerTraits>::HitContainer;
0056 
0057   template <typename TrackerTraits>
0058   using HitsConstView = typename GPUCACellT<TrackerTraits>::HitsConstView;
0059 
0060   template <typename TrackerTraits>
0061   using QualityCuts = pixelTrack::QualityCutsT<TrackerTraits>;
0062 
0063   template <typename TrackerTraits>
0064   using CAParams = caHitNtupletGenerator::CAParamsT<TrackerTraits>;
0065 
0066   using Counters = caHitNtupletGenerator::Counters;
0067 
0068   template <typename TrackerTraits>
0069   __global__ void kernel_checkOverflows(TkSoAView<TrackerTraits> tracks_view,
0070                                         TupleMultiplicity<TrackerTraits> const *tupleMultiplicity,
0071                                         HitToTuple<TrackerTraits> const *hitToTuple,
0072                                         cms::cuda::AtomicPairCounter *apc,
0073                                         GPUCACellT<TrackerTraits> const *__restrict__ cells,
0074                                         uint32_t const *__restrict__ nCells,
0075                                         CellNeighborsVector<TrackerTraits> const *cellNeighbors,
0076                                         CellTracksVector<TrackerTraits> const *cellTracks,
0077                                         OuterHitOfCell<TrackerTraits> const isOuterHitOfCell,
0078                                         int32_t nHits,
0079                                         uint32_t maxNumberOfDoublets,
0080                                         Counters *counters) {
0081     auto first = threadIdx.x + blockIdx.x * blockDim.x;
0082 
0083     auto &c = *counters;
0084     // counters once per event
0085     if (0 == first) {
0086       atomicAdd(&c.nEvents, 1);
0087       atomicAdd(&c.nHits, nHits);
0088       atomicAdd(&c.nCells, *nCells);
0089       atomicAdd(&c.nTuples, apc->get().m);
0090       atomicAdd(&c.nFitTracks, tupleMultiplicity->size());
0091     }
0092 
0093 #ifdef NTUPLE_DEBUG
0094     if (0 == first) {
0095       printf("number of found cells %d \n found tuples %d with total hits %d out of %d %d\n",
0096              *nCells,
0097              apc->get().m,
0098              apc->get().n,
0099              nHits,
0100              hitToTuple->totOnes());
0101       if (apc->get().m < TrackerTraits::maxNumberOfQuadruplets) {
0102         assert(tracks_view.hitIndices().size(apc->get().m) == 0);
0103         assert(tracks_view.hitIndices().size() == apc->get().n);
0104       }
0105     }
0106 
0107     for (int idx = first, nt = tracks_view.hitIndices().nOnes(); idx < nt; idx += gridDim.x * blockDim.x) {
0108       if (tracks_view.hitIndices().size(idx) > TrackerTraits::maxHitsOnTrack)  // current real limit
0109         printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx));
0110       assert(tracks_view.hitIndices().size(idx) <= TrackerTraits::maxHitsOnTrack);
0111       for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih)
0112         assert(int(*ih) < nHits);
0113     }
0114 #endif
0115 
0116     if (0 == first) {
0117       if (apc->get().m >= TrackerTraits::maxNumberOfQuadruplets)
0118         printf("Tuples overflow\n");
0119       if (*nCells >= maxNumberOfDoublets)
0120         printf("Cells overflow\n");
0121       if (cellNeighbors && cellNeighbors->full())
0122         printf("cellNeighbors overflow %d %d \n", cellNeighbors->capacity(), cellNeighbors->size());
0123       if (cellTracks && cellTracks->full())
0124         printf("cellTracks overflow\n");
0125       if (int(hitToTuple->nOnes()) < nHits)
0126         printf("ERROR hitToTuple  overflow %d %d\n", hitToTuple->nOnes(), nHits);
0127 #ifdef GPU_DEBUG
0128       printf("size of cellNeighbors %d \n cellTracks %d \n hitToTuple %d \n",
0129              cellNeighbors->size(),
0130              cellTracks->size(),
0131              hitToTuple->size());
0132 
0133 #endif
0134     }
0135 
0136     for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
0137       auto const &thisCell = cells[idx];
0138       if (thisCell.hasFishbone() && !thisCell.isKilled())
0139         atomicAdd(&c.nFishCells, 1);
0140       if (thisCell.outerNeighbors().full())  //++tooManyNeighbors[thisCell.theLayerPairId];
0141         printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId());
0142       if (thisCell.tracks().full())  //++tooManyTracks[thisCell.theLayerPairId];
0143         printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId());
0144       if (thisCell.isKilled())
0145         atomicAdd(&c.nKilledCells, 1);
0146       if (!thisCell.unused())
0147         atomicAdd(&c.nEmptyCells, 1);
0148       if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id())))
0149         atomicAdd(&c.nZeroTrackCells, 1);
0150     }
0151 
0152     for (int idx = first, nt = nHits - isOuterHitOfCell.offset; idx < nt; idx += gridDim.x * blockDim.x) {
0153       if (isOuterHitOfCell.container[idx].full())  // ++tooManyOuterHitOfCell;
0154         printf("OuterHitOfCell overflow %d\n", idx);
0155     }
0156   }
0157 
0158   template <typename TrackerTraits>
0159   __global__ void kernel_fishboneCleaner(GPUCACellT<TrackerTraits> const *cells,
0160                                          uint32_t const *__restrict__ nCells,
0161                                          TkSoAView<TrackerTraits> tracks_view) {
0162     constexpr auto reject = pixelTrack::Quality::dup;
0163 
0164     auto first = threadIdx.x + blockIdx.x * blockDim.x;
0165     for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
0166       auto const &thisCell = cells[idx];
0167       if (!thisCell.isKilled())
0168         continue;
0169 
0170       for (auto it : thisCell.tracks())
0171         tracks_view[it].quality() = reject;
0172     }
0173   }
0174 
0175   // remove shorter tracks if sharing a cell
0176   // It does not seem to affect efficiency in any way!
0177   template <typename TrackerTraits>
0178   __global__ void kernel_earlyDuplicateRemover(GPUCACellT<TrackerTraits> const *cells,
0179                                                uint32_t const *__restrict__ nCells,
0180                                                TkSoAView<TrackerTraits> tracks_view,
0181                                                bool dupPassThrough) {
0182     // quality to mark rejected
0183     constexpr auto reject = pixelTrack::Quality::edup;  /// cannot be loose
0184 
0185     assert(nCells);
0186     auto first = threadIdx.x + blockIdx.x * blockDim.x;
0187     for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
0188       auto const &thisCell = cells[idx];
0189 
0190       if (thisCell.tracks().size() < 2)
0191         continue;
0192 
0193       int8_t maxNl = 0;
0194 
0195       // find maxNl
0196       for (auto it : thisCell.tracks()) {
0197         auto nl = tracks_view[it].nLayers();
0198         maxNl = std::max(nl, maxNl);
0199       }
0200 
0201       // if (maxNl<4) continue;
0202       // quad pass through (leave it her for tests)
0203       //  maxNl = std::min(4, maxNl);
0204 
0205       for (auto it : thisCell.tracks()) {
0206         if (tracks_view[it].nLayers() < maxNl)
0207           tracks_view[it].quality() = reject;  //no race:  simple assignment of the same constant
0208       }
0209     }
0210   }
0211 
0212   // assume the above (so, short tracks already removed)
0213   template <typename TrackerTraits>
0214   __global__ void kernel_fastDuplicateRemover(GPUCACellT<TrackerTraits> const *__restrict__ cells,
0215                                               uint32_t const *__restrict__ nCells,
0216                                               TkSoAView<TrackerTraits> tracks_view,
0217                                               bool dupPassThrough) {
0218     // quality to mark rejected
0219     auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
0220     constexpr auto loose = pixelTrack::Quality::loose;
0221 
0222     assert(nCells);
0223 
0224     auto first = threadIdx.x + blockIdx.x * blockDim.x;
0225     for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
0226       auto const &thisCell = cells[idx];
0227       if (thisCell.tracks().size() < 2)
0228         continue;
0229 
0230       float mc = maxScore;
0231       uint16_t im = tkNotFound;
0232 
0233       auto score = [&](auto it) { return std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it)); };
0234 
0235       // full crazy combinatorics
0236       // full crazy combinatorics
0237       int ntr = thisCell.tracks().size();
0238       for (int i = 0; i < ntr - 1; ++i) {
0239         auto it = thisCell.tracks()[i];
0240         auto qi = tracks_view[it].quality();
0241         if (qi <= reject)
0242           continue;
0243         auto opi = tracks_view[it].state()(2);
0244         auto e2opi = tracks_view[it].covariance()(9);
0245         auto cti = tracks_view[it].state()(3);
0246         auto e2cti = tracks_view[it].covariance()(12);
0247         for (auto j = i + 1; j < ntr; ++j) {
0248           auto jt = thisCell.tracks()[j];
0249           auto qj = tracks_view[jt].quality();
0250           if (qj <= reject)
0251             continue;
0252           auto opj = tracks_view[jt].state()(2);
0253           auto ctj = tracks_view[jt].state()(3);
0254           auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti);
0255           if ((cti - ctj) * (cti - ctj) > dct)
0256             continue;
0257           auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi);
0258           if ((opi - opj) * (opi - opj) > dop)
0259             continue;
0260           if ((qj < qi) || (qj == qi && score(it) < score(jt)))
0261             tracks_view[jt].quality() = reject;
0262           else {
0263             tracks_view[it].quality() = reject;
0264             break;
0265           }
0266         }
0267       }
0268 
0269       // find maxQual
0270       auto maxQual = reject;  // no duplicate!
0271       for (auto it : thisCell.tracks()) {
0272         if (tracks_view[it].quality() > maxQual)
0273           maxQual = tracks_view[it].quality();
0274       }
0275 
0276       if (maxQual <= loose)
0277         continue;
0278 
0279       // find min score
0280       for (auto it : thisCell.tracks()) {
0281         if (tracks_view[it].quality() == maxQual && score(it) < mc) {
0282           mc = score(it);
0283           im = it;
0284         }
0285       }
0286 
0287       if (tkNotFound == im)
0288         continue;
0289 
0290       // mark all other duplicates  (not yet, keep it loose)
0291       for (auto it : thisCell.tracks()) {
0292         if (tracks_view[it].quality() > loose && it != im)
0293           tracks_view[it].quality() = loose;  //no race:  simple assignment of the same constant
0294       }
0295     }
0296   }
0297 
0298   template <typename TrackerTraits>
0299   __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1,
0300                                  cms::cuda::AtomicPairCounter *apc2,  // just to zero them,
0301                                  HitsConstView<TrackerTraits> hh,
0302                                  GPUCACellT<TrackerTraits> *cells,
0303                                  uint32_t const *__restrict__ nCells,
0304                                  CellNeighborsVector<TrackerTraits> *cellNeighbors,
0305                                  OuterHitOfCell<TrackerTraits> const isOuterHitOfCell,
0306                                  CAParams<TrackerTraits> params) {
0307     using Cell = GPUCACellT<TrackerTraits>;
0308 
0309     auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y;
0310     auto first = threadIdx.x;
0311     auto stride = blockDim.x;
0312 
0313     if (0 == (firstCellIndex + first)) {
0314       (*apc1) = 0;
0315       (*apc2) = 0;
0316     }  // ready for next kernel
0317 
0318     constexpr uint32_t last_bpix1_detIndex = TrackerTraits::last_bpix1_detIndex;
0319     constexpr uint32_t last_barrel_detIndex = TrackerTraits::last_barrel_detIndex;
0320     for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) {
0321       auto cellIndex = idx;
0322       auto &thisCell = cells[idx];
0323       auto innerHitId = thisCell.inner_hit_id();
0324       if (int(innerHitId) < isOuterHitOfCell.offset)
0325         continue;
0326       int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
0327       auto vi = isOuterHitOfCell[innerHitId].data();
0328 
0329       auto ri = thisCell.inner_r(hh);
0330       auto zi = thisCell.inner_z(hh);
0331 
0332       auto ro = thisCell.outer_r(hh);
0333       auto zo = thisCell.outer_z(hh);
0334       auto isBarrel = thisCell.inner_detIndex(hh) < last_barrel_detIndex;
0335 
0336       for (int j = first; j < numberOfPossibleNeighbors; j += stride) {
0337         auto otherCell = __ldg(vi + j);
0338         auto &oc = cells[otherCell];
0339         auto r1 = oc.inner_r(hh);
0340         auto z1 = oc.inner_z(hh);
0341         bool aligned = Cell::areAlignedRZ(
0342             r1,
0343             z1,
0344             ri,
0345             zi,
0346             ro,
0347             zo,
0348             params.ptmin_,
0349             isBarrel ? params.CAThetaCutBarrel_ : params.CAThetaCutForward_);  // 2.f*thetaCut); // FIXME tune cuts
0350         if (aligned && thisCell.dcaCut(hh,
0351                                        oc,
0352                                        oc.inner_detIndex(hh) < last_bpix1_detIndex ? params.dcaCutInnerTriplet_
0353                                                                                    : params.dcaCutOuterTriplet_,
0354                                        params.hardCurvCut_)) {  // FIXME tune cuts
0355           oc.addOuterNeighbor(cellIndex, *cellNeighbors);
0356           thisCell.setStatusBits(Cell::StatusBit::kUsed);
0357           oc.setStatusBits(Cell::StatusBit::kUsed);
0358         }
0359       }  // loop on inner cells
0360     }    // loop on outer cells
0361   }
0362 
0363   template <typename TrackerTraits>
0364   __global__ void kernel_find_ntuplets(HitsConstView<TrackerTraits> hh,
0365                                        TkSoAView<TrackerTraits> tracks_view,
0366                                        GPUCACellT<TrackerTraits> *__restrict__ cells,
0367                                        uint32_t const *nCells,
0368                                        CellTracksVector<TrackerTraits> *cellTracks,
0369                                        cms::cuda::AtomicPairCounter *apc,
0370                                        CAParams<TrackerTraits> params) {
0371     // recursive: not obvious to widen
0372 
0373     using Cell = GPUCACellT<TrackerTraits>;
0374 
0375     auto first = threadIdx.x + blockIdx.x * blockDim.x;
0376 
0377 #ifdef GPU_DEBUG
0378     if (first == 0)
0379       printf("starting producing ntuplets from %d cells \n", *nCells);
0380 #endif
0381     for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
0382       auto const &thisCell = cells[idx];
0383 
0384       if (thisCell.isKilled())
0385         continue;  // cut by earlyFishbone
0386 
0387       // we require at least three hits...
0388       if (thisCell.outerNeighbors().empty())
0389         continue;
0390 
0391       auto pid = thisCell.layerPairId();
0392       bool doit = params.startingLayerPair(pid);
0393 
0394       constexpr uint32_t maxDepth = TrackerTraits::maxDepth;
0395       if (doit) {
0396         typename Cell::TmpTuple stack;
0397         stack.reset();
0398 
0399         bool bpix1Start = params.startAt0(pid);
0400 
0401         thisCell.template find_ntuplets<maxDepth>(hh,
0402                                                   cells,
0403                                                   *cellTracks,
0404                                                   tracks_view.hitIndices(),
0405                                                   *apc,
0406                                                   tracks_view.quality(),
0407                                                   stack,
0408                                                   params.minHitsPerNtuplet_,
0409                                                   bpix1Start);
0410 
0411         assert(stack.empty());
0412       }
0413     }
0414   }
0415   template <typename TrackerTraits>
0416   __global__ void kernel_mark_used(GPUCACellT<TrackerTraits> *__restrict__ cells, uint32_t const *nCells) {
0417     auto first = threadIdx.x + blockIdx.x * blockDim.x;
0418     using Cell = GPUCACellT<TrackerTraits>;
0419     for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
0420       auto &thisCell = cells[idx];
0421       if (!thisCell.tracks().empty())
0422         thisCell.setStatusBits(Cell::StatusBit::kInTrack);
0423     }
0424   }
0425 
0426   template <typename TrackerTraits>
0427   __global__ void kernel_countMultiplicity(TkSoAView<TrackerTraits> tracks_view,
0428                                            TupleMultiplicity<TrackerTraits> *tupleMultiplicity) {
0429     auto first = blockIdx.x * blockDim.x + threadIdx.x;
0430     for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) {
0431       auto nhits = tracks_view.hitIndices().size(it);
0432       if (nhits < 3)
0433         continue;
0434       if (tracks_view[it].quality() == pixelTrack::Quality::edup)
0435         continue;
0436       assert(tracks_view[it].quality() == pixelTrack::Quality::bad);
0437       if (nhits > TrackerTraits::maxHitsOnTrack)  // current limit
0438         printf("wrong mult %d %d\n", it, nhits);
0439       assert(nhits <= TrackerTraits::maxHitsOnTrack);
0440       tupleMultiplicity->count(nhits);
0441     }
0442   }
0443 
0444   template <typename TrackerTraits>
0445   __global__ void kernel_fillMultiplicity(TkSoAView<TrackerTraits> tracks_view,
0446                                           TupleMultiplicity<TrackerTraits> *tupleMultiplicity) {
0447     auto first = blockIdx.x * blockDim.x + threadIdx.x;
0448     for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) {
0449       auto nhits = tracks_view.hitIndices().size(it);
0450       if (nhits < 3)
0451         continue;
0452       if (tracks_view[it].quality() == pixelTrack::Quality::edup)
0453         continue;
0454       assert(tracks_view[it].quality() == pixelTrack::Quality::bad);
0455       if (nhits > TrackerTraits::maxHitsOnTrack)
0456         printf("wrong mult %d %d\n", it, nhits);
0457       assert(nhits <= TrackerTraits::maxHitsOnTrack);
0458       tupleMultiplicity->fill(nhits, it);
0459     }
0460   }
0461 
0462   ///TODO : why there was quality here?
0463   template <typename TrackerTraits>
0464   __global__ void kernel_classifyTracks(TkSoAView<TrackerTraits> tracks_view, QualityCuts<TrackerTraits> cuts) {
0465     // Quality *__restrict__ quality) {
0466     int first = blockDim.x * blockIdx.x + threadIdx.x;
0467     for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) {
0468       auto nhits = tracks_view.hitIndices().size(it);
0469       if (nhits == 0)
0470         break;  // guard
0471 
0472       // if duplicate: not even fit
0473       if (tracks_view[it].quality() == pixelTrack::Quality::edup)
0474         continue;
0475 
0476       assert(tracks_view[it].quality() == pixelTrack::Quality::bad);
0477 
0478       // mark doublets as bad
0479       if (nhits < 3)
0480         continue;
0481 
0482       // if the fit has any invalid parameters, mark it as bad
0483       bool isNaN = false;
0484       for (int i = 0; i < 5; ++i) {
0485         isNaN |= std::isnan(tracks_view[it].state()(i));
0486       }
0487       if (isNaN) {
0488 #ifdef NTUPLE_DEBUG
0489         printf("NaN in fit %d size %d chi2 %f\n", it, tracks_view.hitIndices().size(it), tracks_view[it].chi2());
0490 #endif
0491         continue;
0492       }
0493 
0494       tracks_view[it].quality() = pixelTrack::Quality::strict;
0495 
0496       if (cuts.strictCut(tracks_view, it))
0497         continue;
0498 
0499       tracks_view[it].quality() = pixelTrack::Quality::tight;
0500 
0501       if (cuts.isHP(tracks_view, nhits, it))
0502         tracks_view[it].quality() = pixelTrack::Quality::highPurity;
0503     }
0504   }
0505 
0506   template <typename TrackerTraits>
0507   __global__ void kernel_doStatsForTracks(TkSoAView<TrackerTraits> tracks_view, Counters *counters) {
0508     int first = blockDim.x * blockIdx.x + threadIdx.x;
0509     for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0510       if (tracks_view.hitIndices().size(idx) == 0)
0511         break;  //guard
0512       if (tracks_view[idx].quality() < pixelTrack::Quality::loose)
0513         continue;
0514       atomicAdd(&(counters->nLooseTracks), 1);
0515       if (tracks_view[idx].quality() < pixelTrack::Quality::strict)
0516         continue;
0517       atomicAdd(&(counters->nGoodTracks), 1);
0518     }
0519   }
0520 
0521   template <typename TrackerTraits>
0522   __global__ void kernel_countHitInTracks(TkSoAView<TrackerTraits> tracks_view, HitToTuple<TrackerTraits> *hitToTuple) {
0523     int first = blockDim.x * blockIdx.x + threadIdx.x;
0524     for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0525       if (tracks_view.hitIndices().size(idx) == 0)
0526         break;  // guard
0527       for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h)
0528         hitToTuple->count(*h);
0529     }
0530   }
0531 
0532   template <typename TrackerTraits>
0533   __global__ void kernel_fillHitInTracks(TkSoAView<TrackerTraits> tracks_view, HitToTuple<TrackerTraits> *hitToTuple) {
0534     int first = blockDim.x * blockIdx.x + threadIdx.x;
0535     for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0536       if (tracks_view.hitIndices().size(idx) == 0)
0537         break;  // guard
0538       for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h)
0539         hitToTuple->fill(*h, idx);
0540     }
0541   }
0542 
0543   template <typename TrackerTraits>
0544   __global__ void kernel_fillHitDetIndices(TkSoAView<TrackerTraits> tracks_view, HitsConstView<TrackerTraits> hh) {
0545     int first = blockDim.x * blockIdx.x + threadIdx.x;
0546     // copy offsets
0547     for (int idx = first, ntot = tracks_view.hitIndices().totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0548       tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx];
0549     }
0550     // fill hit indices
0551     auto nhits = hh.nHits();
0552 
0553     for (int idx = first, ntot = tracks_view.hitIndices().size(); idx < ntot; idx += gridDim.x * blockDim.x) {
0554       assert(tracks_view.hitIndices().content[idx] < nhits);
0555       tracks_view.detIndices().content[idx] = hh[tracks_view.hitIndices().content[idx]].detectorIndex();
0556     }
0557   }
0558 
0559   template <typename TrackerTraits>
0560   __global__ void kernel_fillNLayers(TkSoAView<TrackerTraits> tracks_view, cms::cuda::AtomicPairCounter *apc) {
0561     auto first = blockIdx.x * blockDim.x + threadIdx.x;
0562     // clamp the number of tracks to the capacity of the SoA
0563     auto ntracks = std::min<int>(apc->get().m, tracks_view.metadata().size() - 1);
0564     if (0 == first)
0565       tracks_view.nTracks() = ntracks;
0566     for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
0567       auto nHits = TracksUtilities<TrackerTraits>::nHits(tracks_view, idx);
0568       assert(nHits >= 3);
0569       tracks_view[idx].nLayers() = TracksUtilities<TrackerTraits>::computeNumberOfLayers(tracks_view, idx);
0570     }
0571   }
0572 
0573   template <typename TrackerTraits>
0574   __global__ void kernel_doStatsForHitInTracks(HitToTuple<TrackerTraits> const *__restrict__ hitToTuple,
0575                                                Counters *counters) {
0576     auto &c = *counters;
0577     int first = blockDim.x * blockIdx.x + threadIdx.x;
0578     for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0579       if (hitToTuple->size(idx) == 0)
0580         continue;  // SHALL NOT BE break
0581       atomicAdd(&c.nUsedHits, 1);
0582       if (hitToTuple->size(idx) > 1)
0583         atomicAdd(&c.nDupHits, 1);
0584     }
0585   }
0586 
0587   template <typename TrackerTraits>
0588   __global__ void kernel_countSharedHit(int *__restrict__ nshared,
0589                                         HitContainer<TrackerTraits> const *__restrict__ ptuples,
0590                                         Quality const *__restrict__ quality,
0591                                         HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) {
0592     constexpr auto loose = pixelTrack::Quality::loose;
0593 
0594     auto &hitToTuple = *phitToTuple;
0595     auto const &foundNtuplets = *ptuples;
0596 
0597     int first = blockDim.x * blockIdx.x + threadIdx.x;
0598     for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0599       if (hitToTuple.size(idx) < 2)
0600         continue;
0601 
0602       int nt = 0;
0603 
0604       // count "good" tracks
0605       for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
0606         if (quality[*it] < loose)
0607           continue;
0608         ++nt;
0609       }
0610 
0611       if (nt < 2)
0612         continue;
0613 
0614       // now mark  each track triplet as sharing a hit
0615       for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
0616         if (foundNtuplets.size(*it) > 3)
0617           continue;
0618         atomicAdd(&nshared[*it], 1);
0619       }
0620 
0621     }  //  hit loop
0622   }
0623 
0624   template <typename TrackerTraits>
0625   __global__ void kernel_markSharedHit(int const *__restrict__ nshared,
0626                                        HitContainer<TrackerTraits> const *__restrict__ tuples,
0627                                        Quality *__restrict__ quality,
0628                                        bool dupPassThrough) {
0629     // constexpr auto bad = pixelTrack::Quality::bad;
0630     constexpr auto dup = pixelTrack::Quality::dup;
0631     constexpr auto loose = pixelTrack::Quality::loose;
0632     // constexpr auto strict = pixelTrack::Quality::strict;
0633 
0634     // quality to mark rejected
0635     auto const reject = dupPassThrough ? loose : dup;
0636 
0637     int first = blockDim.x * blockIdx.x + threadIdx.x;
0638     for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0639       if (tuples->size(idx) == 0)
0640         break;  //guard
0641       if (quality[idx] <= reject)
0642         continue;
0643       if (nshared[idx] > 2)
0644         quality[idx] = reject;
0645     }
0646   }
0647 
0648   // mostly for very forward triplets.....
0649   template <typename TrackerTraits>
0650   __global__ void kernel_rejectDuplicate(TkSoAView<TrackerTraits> tracks_view,
0651                                          uint16_t nmin,
0652                                          bool dupPassThrough,
0653                                          HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) {
0654     // quality to mark rejected
0655     auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
0656 
0657     auto &hitToTuple = *phitToTuple;
0658 
0659     int first = blockDim.x * blockIdx.x + threadIdx.x;
0660     for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0661       if (hitToTuple.size(idx) < 2)
0662         continue;
0663 
0664       auto score = [&](auto it, auto nl) { return std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it)); };
0665 
0666       // full combinatorics
0667       for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) {
0668         auto const it = *ip;
0669         auto qi = tracks_view[it].quality();
0670         if (qi <= reject)
0671           continue;
0672         auto opi = tracks_view[it].state()(2);
0673         auto e2opi = tracks_view[it].covariance()(9);
0674         auto cti = tracks_view[it].state()(3);
0675         auto e2cti = tracks_view[it].covariance()(12);
0676         auto nli = tracks_view[it].nLayers();
0677         for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) {
0678           auto const jt = *jp;
0679           auto qj = tracks_view[jt].quality();
0680           if (qj <= reject)
0681             continue;
0682           auto opj = tracks_view[jt].state()(2);
0683           auto ctj = tracks_view[jt].state()(3);
0684           auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti);
0685           if ((cti - ctj) * (cti - ctj) > dct)
0686             continue;
0687           auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi);
0688           if ((opi - opj) * (opi - opj) > dop)
0689             continue;
0690           auto nlj = tracks_view[jt].nLayers();
0691           if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj)))))
0692             tracks_view[jt].quality() = reject;
0693           else {
0694             tracks_view[it].quality() = reject;
0695             break;
0696           }
0697         }
0698       }
0699     }
0700   }
0701 
0702   template <typename TrackerTraits>
0703   __global__ void kernel_sharedHitCleaner(HitsConstView<TrackerTraits> hh,
0704                                           TkSoAView<TrackerTraits> tracks_view,
0705                                           int nmin,
0706                                           bool dupPassThrough,
0707                                           HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) {
0708     // quality to mark rejected
0709     auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
0710     // quality of longest track
0711     auto const longTqual = pixelTrack::Quality::highPurity;
0712 
0713     auto &hitToTuple = *phitToTuple;
0714 
0715     int l1end = hh.hitsLayerStart()[1];
0716 
0717     int first = blockDim.x * blockIdx.x + threadIdx.x;
0718     for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0719       if (hitToTuple.size(idx) < 2)
0720         continue;
0721 
0722       int8_t maxNl = 0;
0723 
0724       // find maxNl
0725       for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
0726         if (tracks_view[*it].quality() < longTqual)
0727           continue;
0728         // if (tracks_view[*it].nHits()==3) continue;
0729         auto nl = tracks_view[*it].nLayers();
0730         maxNl = std::max(nl, maxNl);
0731       }
0732 
0733       if (maxNl < 4)
0734         continue;
0735 
0736       // quad pass through (leave for tests)
0737       // maxNl = std::min(4, maxNl);
0738 
0739       // kill all tracks shorter than maxHl (only triplets???
0740       for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
0741         auto nl = tracks_view[*it].nLayers();
0742 
0743         //checking if shared hit is on bpix1 and if the tuple is short enough
0744         if (idx < l1end and nl > nmin)
0745           continue;
0746 
0747         if (nl < maxNl && tracks_view[*it].quality() > reject)
0748           tracks_view[*it].quality() = reject;
0749       }
0750     }
0751   }
0752 
0753   template <typename TrackerTraits>
0754   __global__ void kernel_tripletCleaner(TkSoAView<TrackerTraits> tracks_view,
0755                                         uint16_t nmin,
0756                                         bool dupPassThrough,
0757                                         HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) {
0758     // quality to mark rejected
0759     auto const reject = pixelTrack::Quality::loose;
0760     /// min quality of good
0761     auto const good = pixelTrack::Quality::strict;
0762 
0763     auto &hitToTuple = *phitToTuple;
0764 
0765     int first = blockDim.x * blockIdx.x + threadIdx.x;
0766     for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0767       if (hitToTuple.size(idx) < 2)
0768         continue;
0769 
0770       float mc = maxScore;
0771       uint16_t im = tkNotFound;
0772       bool onlyTriplets = true;
0773 
0774       // check if only triplets
0775       for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
0776         if (tracks_view[*it].quality() <= good)
0777           continue;
0778         onlyTriplets &= TracksUtilities<TrackerTraits>::isTriplet(tracks_view, *it);
0779         if (!onlyTriplets)
0780           break;
0781       }
0782 
0783       // only triplets
0784       if (!onlyTriplets)
0785         continue;
0786 
0787       // for triplets choose best tip!  (should we first find best quality???)
0788       for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
0789         auto const it = *ip;
0790         if (tracks_view[it].quality() >= good && std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it)) < mc) {
0791           mc = std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it));
0792           im = it;
0793         }
0794       }
0795 
0796       if (tkNotFound == im)
0797         continue;
0798 
0799       // mark worse ambiguities
0800       for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
0801         auto const it = *ip;
0802         if (tracks_view[it].quality() > reject && it != im)
0803           tracks_view[it].quality() = reject;  //no race:  simple assignment of the same constant
0804       }
0805 
0806     }  // loop over hits
0807   }
0808 
0809   template <typename TrackerTraits>
0810   __global__ void kernel_simpleTripletCleaner(TkSoAView<TrackerTraits> tracks_view,
0811                                               uint16_t nmin,
0812                                               bool dupPassThrough,
0813                                               HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) {
0814     // quality to mark rejected
0815     auto const reject = pixelTrack::Quality::loose;
0816     /// min quality of good
0817     auto const good = pixelTrack::Quality::loose;
0818 
0819     auto &hitToTuple = *phitToTuple;
0820 
0821     int first = blockDim.x * blockIdx.x + threadIdx.x;
0822     for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
0823       if (hitToTuple.size(idx) < 2)
0824         continue;
0825 
0826       float mc = maxScore;
0827       uint16_t im = tkNotFound;
0828 
0829       // choose best tip!  (should we first find best quality???)
0830       for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
0831         auto const it = *ip;
0832         if (tracks_view[it].quality() >= good && std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it)) < mc) {
0833           mc = std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it));
0834           im = it;
0835         }
0836       }
0837 
0838       if (tkNotFound == im)
0839         continue;
0840 
0841       // mark worse ambiguities
0842       for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
0843         auto const it = *ip;
0844         if (tracks_view[it].quality() > reject && TracksUtilities<TrackerTraits>::isTriplet(tracks_view, it) &&
0845             it != im)
0846           tracks_view[it].quality() = reject;  //no race:  simple assignment of the same constant
0847       }
0848 
0849     }  // loop over hits
0850   }
0851 
0852   template <typename TrackerTraits>
0853   __global__ void kernel_print_found_ntuplets(HitsConstView<TrackerTraits> hh,
0854                                               TkSoAView<TrackerTraits> tracks_view,
0855                                               HitToTuple<TrackerTraits> const *__restrict__ phitToTuple,
0856                                               int32_t firstPrint,
0857                                               int32_t lastPrint,
0858                                               int iev) {
0859     constexpr auto loose = pixelTrack::Quality::loose;
0860 
0861     int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x;
0862     for (int i = first, np = std::min(lastPrint, tracks_view.hitIndices().nOnes()); i < np;
0863          i += blockDim.x * gridDim.x) {
0864       auto nh = tracks_view.hitIndices().size(i);
0865       if (nh < 3)
0866         continue;
0867       if (tracks_view[i].quality() < loose)
0868         continue;
0869       printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n",
0870              10000 * iev + i,
0871              int(tracks_view[i].quality()),
0872              nh,
0873              tracks_view[i].nLayers(),
0874              TracksUtilities<TrackerTraits>::charge(tracks_view, i),
0875              tracks_view[i].pt(),
0876              tracks_view[i].eta(),
0877              TracksUtilities<TrackerTraits>::phi(tracks_view, i),
0878              TracksUtilities<TrackerTraits>::tip(tracks_view, i),
0879              TracksUtilities<TrackerTraits>::zip(tracks_view, i),
0880              tracks_view[i].chi2(),
0881              hh[*tracks_view.hitIndices().begin(i)].zGlobal(),
0882              hh[*(tracks_view.hitIndices().begin(i) + 1)].zGlobal(),
0883              hh[*(tracks_view.hitIndices().begin(i) + 2)].zGlobal(),
0884              nh > 3 ? hh[int(*(tracks_view.hitIndices().begin(i) + 3))].zGlobal() : 0,
0885              nh > 4 ? hh[int(*(tracks_view.hitIndices().begin(i) + 4))].zGlobal() : 0,
0886              nh > 5 ? hh[int(*(tracks_view.hitIndices().begin(i) + 5))].zGlobal() : 0,
0887              nh > 6 ? hh[int(*(tracks_view.hitIndices().begin(i) + nh - 1))].zGlobal() : 0);
0888     }
0889   }
0890 
0891   __global__ void kernel_printCounters(Counters const *counters) {
0892     auto const &c = *counters;
0893     printf(
0894         "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks  |  nLooseTracks  |  nGoodTracks | nUsedHits | "
0895         "nDupHits | "
0896         "nFishCells | "
0897         "nKilledCells | "
0898         "nUsedCells | nZeroTrackCells ||\n");
0899     printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
0900            c.nEvents,
0901            c.nHits,
0902            c.nCells,
0903            c.nTuples,
0904            c.nFitTracks,
0905            c.nLooseTracks,
0906            c.nGoodTracks,
0907            c.nUsedHits,
0908            c.nDupHits,
0909            c.nFishCells,
0910            c.nKilledCells,
0911            c.nEmptyCells,
0912            c.nZeroTrackCells);
0913     printf(
0914         "Counters Norm %lld ||  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.3f|  %.3f|  %.3f|  %.3f||\n",
0915         c.nEvents,
0916         c.nHits / double(c.nEvents),
0917         c.nCells / double(c.nEvents),
0918         c.nTuples / double(c.nEvents),
0919         c.nFitTracks / double(c.nEvents),
0920         c.nLooseTracks / double(c.nEvents),
0921         c.nGoodTracks / double(c.nEvents),
0922         c.nUsedHits / double(c.nEvents),
0923         c.nDupHits / double(c.nEvents),
0924         c.nFishCells / double(c.nCells),
0925         c.nKilledCells / double(c.nCells),
0926         c.nEmptyCells / double(c.nCells),
0927         c.nZeroTrackCells / double(c.nCells));
0928   }
0929 
0930 }  // namespace caHitNtupletGeneratorKernels