//
// Original Author: Felice Pantaleo, CERN
//

// #define NTUPLE_DEBUG
// #define GPU_DEBUG

#include <cmath>
#include <cstdint>
#include <limits>

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"

#include "CAConstants.h"
#include "CAHitNtupletGeneratorKernels.h"
#include "GPUCACell.h"
#include "gpuFishbone.h"
#include "gpuPixelDoublets.h"

using HitsOnGPU = TrackingRecHit2DSOAView;
using HitsOnCPU = TrackingRecHit2DGPU;

using HitToTuple = caConstants::HitToTuple;
using TupleMultiplicity = caConstants::TupleMultiplicity;

using Quality = pixelTrack::Quality;
using TkSoA = pixelTrack::TrackSoA;
using HitContainer = pixelTrack::HitContainer;

namespace {

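  // tkNotFound and maxScore are sentinels for the "best duplicate" searches in
  // the cleaning kernels below; nSigma2 = 5^2 is the compatibility window used
  // by the duplicate removers: two track parameters are considered compatible
  // when (p_i - p_j)^2 < nSigma2 * (var_i + var_j), i.e. they agree within 5 sigma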
  constexpr uint16_t tkNotFound = std::numeric_limits<uint16_t>::max();
  constexpr float maxScore = std::numeric_limits<float>::max();
  constexpr float nSigma2 = 25.f;

}  // namespace

__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
                                      caConstants::TupleMultiplicity const *tupleMultiplicity,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple,
                                      cms::cuda::AtomicPairCounter *apc,
                                      GPUCACell const *__restrict__ cells,
                                      uint32_t const *__restrict__ nCells,
                                      gpuPixelDoublets::CellNeighborsVector const *cellNeighbors,
                                      gpuPixelDoublets::CellTracksVector const *cellTracks,
                                      GPUCACell::OuterHitOfCell const isOuterHitOfCell,
                                      int32_t nHits,
                                      uint32_t maxNumberOfDoublets,
                                      CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  auto first = threadIdx.x + blockIdx.x * blockDim.x;
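  // the loops below are grid-stride loops (the pattern used throughout this
  // file): each thread starts at its global index and advances by the total
  // number of threads in the grid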

  auto &c = *counters;
  // update the event-level counters once per event
  if (0 == first) {
    atomicAdd(&c.nEvents, 1);
    atomicAdd(&c.nHits, nHits);
    atomicAdd(&c.nCells, *nCells);
    atomicAdd(&c.nTuples, apc->get().m);
    atomicAdd(&c.nFitTracks, tupleMultiplicity->size());
  }

#ifdef NTUPLE_DEBUG
  if (0 == first) {
    printf("number of found cells %d, found tuples %d with total hits %d out of %d %d\n",
           *nCells,
           apc->get().m,
           apc->get().n,
           nHits,
           hitToTuple->totOnes());
    if (apc->get().m < caConstants::maxNumberOfQuadruplets) {
      assert(foundNtuplets->size(apc->get().m) == 0);
      assert(foundNtuplets->size() == apc->get().n);
    }
  }

  for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) {
    if (foundNtuplets->size(idx) > 7)  // current real limit
      printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx));
    assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack);
    for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih)
      assert(int(*ih) < nHits);
  }
#endif

  if (0 == first) {
    if (apc->get().m >= caConstants::maxNumberOfQuadruplets)
      printf("Tuples overflow\n");
    if (*nCells >= maxNumberOfDoublets)
      printf("Cells overflow\n");
    if (cellNeighbors && cellNeighbors->full())
      printf("cellNeighbors overflow\n");
    if (cellTracks && cellTracks->full())
      printf("cellTracks overflow\n");
    if (int(hitToTuple->nOnes()) < nHits)
      printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits);
  }

  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.hasFishbone() && !thisCell.isKilled())
      atomicAdd(&c.nFishCells, 1);
    if (thisCell.outerNeighbors().full())  //++tooManyNeighbors[thisCell.theLayerPairId];
      printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId());
    if (thisCell.tracks().full())  //++tooManyTracks[thisCell.theLayerPairId];
      printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId());
    if (thisCell.isKilled())
      atomicAdd(&c.nKilledCells, 1);
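    // NB: despite its name, nEmptyCells counts cells that are in use
    // (kernel_printCounters prints it in the "nUsedCells" column)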
    if (!thisCell.unused())
      atomicAdd(&c.nEmptyCells, 1);
    if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id())))
      atomicAdd(&c.nZeroTrackCells, 1);
  }

  for (int idx = first, nt = nHits - isOuterHitOfCell.offset; idx < nt; idx += gridDim.x * blockDim.x) {
    if (isOuterHitOfCell.container[idx].full())  // ++tooManyOuterHitOfCell;
      printf("OuterHitOfCell overflow %d\n", idx);
  }
}
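
/* A minimal host-side launch sketch for kernel_checkOverflows, illustrative only:
   the real launch lives in the CAHitNtupletGeneratorKernels host code, and the
   block size and the *_d device-pointer names used here are assumptions:

     constexpr int blockSize = 128;
     int numberOfBlocks = (std::max(nHits, int(maxNumberOfDoublets)) + blockSize - 1) / blockSize;
     kernel_checkOverflows<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
         tuples_d, tupleMultiplicity_d, hitToTuple_d, apc_d, cells_d, nCells_d,
         cellNeighbors_d, cellTracks_d, isOuterHitOfCell, nHits, maxNumberOfDoublets, counters_d);
     cudaCheck(cudaGetLastError());
*/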

__global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *__restrict__ nCells, Quality *quality) {
  constexpr auto reject = pixelTrack::Quality::dup;

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (!thisCell.isKilled())
      continue;

    for (auto it : thisCell.tracks())
      quality[it] = reject;
  }
}

// remove shorter tracks if they share a cell
// It does not seem to affect efficiency in any way!
__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
                                             uint32_t const *__restrict__ nCells,
                                             TkSoA const *__restrict__ ptracks,
                                             Quality *quality,
                                             bool dupPassThrough) {
  // quality to mark rejected
  constexpr auto reject = pixelTrack::Quality::edup;  /// cannot be loose

  auto const &tracks = *ptracks;

  assert(nCells);
  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];

    if (thisCell.tracks().size() < 2)
      continue;

    int8_t maxNl = 0;

    // find maxNl
    for (auto it : thisCell.tracks()) {
      auto nl = tracks.nLayers(it);
      maxNl = std::max(nl, maxNl);
    }

    // if (maxNl<4) continue;
    // quad pass through (leave it here for tests)
    //  maxNl = std::min(4, maxNl);

    for (auto it : thisCell.tracks()) {
      if (tracks.nLayers(it) < maxNl)
        quality[it] = reject;  //no race:  simple assignment of the same constant
    }
  }
}

// assume the above (so, short tracks already removed)
__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
                                            uint32_t const *__restrict__ nCells,
                                            TkSoA *__restrict__ tracks,
                                            bool dupPassThrough) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
  constexpr auto loose = pixelTrack::Quality::loose;

  assert(nCells);

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.tracks().size() < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;

    /* chi2 penalizes higher-pt tracks (try to rescale it?)
    auto score = [&](auto it) {
      return tracks->nLayers(it) < 4 ?
              std::abs(tracks->tip(it)) :  // tip for triplets
              tracks->chi2(it);            //chi2 for quads
    };
    */

    auto score = [&](auto it) { return std::abs(tracks->tip(it)); };

    // full crazy combinatorics
    int ntr = thisCell.tracks().size();
    for (int i = 0; i < ntr - 1; ++i) {
      auto it = thisCell.tracks()[i];
      auto qi = tracks->quality(it);
      if (qi <= reject)
        continue;
      auto opi = tracks->stateAtBS.state(it)(2);
      auto e2opi = tracks->stateAtBS.covariance(it)(9);
      auto cti = tracks->stateAtBS.state(it)(3);
      auto e2cti = tracks->stateAtBS.covariance(it)(12);
      for (auto j = i + 1; j < ntr; ++j) {
        auto jt = thisCell.tracks()[j];
        auto qj = tracks->quality(jt);
        if (qj <= reject)
          continue;
#ifdef GPU_DEBUG
        // foundNtuplets is not a parameter of this kernel: compare the tuple
        // sizes through the track SoA's own hit container instead
        if (tracks->hitIndices.size(it) != tracks->hitIndices.size(jt))
          printf(" a mess\n");
#endif
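        // duplicate test: reject one of the two tracks only if they agree
        // within 5 sigma, (x_i - x_j)^2 < nSigma2 * (var_i + var_j), in both
        // cot(theta) (state(3), covariance(12)) and 1/pt (state(2), covariance(9))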
        auto opj = tracks->stateAtBS.state(jt)(2);
        auto ctj = tracks->stateAtBS.state(jt)(3);
        auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);
        if ((cti - ctj) * (cti - ctj) > dct)
          continue;
        auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi);
        if ((opi - opj) * (opi - opj) > dop)
          continue;
        if ((qj < qi) || (qj == qi && score(it) < score(jt)))
          tracks->quality(jt) = reject;
        else {
          tracks->quality(it) = reject;
          break;
        }
      }
    }

    // find maxQual
    auto maxQual = reject;  // no duplicate!
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) > maxQual)
        maxQual = tracks->quality(it);
    }

    if (maxQual <= loose)
      continue;

    // find min score
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) == maxQual && score(it) < mc) {
        mc = score(it);
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark all other duplicates  (not yet, keep it loose)
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) > loose && it != im)
        tracks->quality(it) = loose;  //no race:  simple assignment of the same constant
    }
  }
}

__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1,
                               cms::cuda::AtomicPairCounter *apc2,  // just to zero them,
                               GPUCACell::Hits const *__restrict__ hhp,
                               GPUCACell *cells,
                               uint32_t const *__restrict__ nCells,
                               gpuPixelDoublets::CellNeighborsVector *cellNeighbors,
                               GPUCACell::OuterHitOfCell const isOuterHitOfCell,
                               float hardCurvCut,
                               float ptmin,
                               float CAThetaCutBarrel,
                               float CAThetaCutForward,
                               float dcaCutInnerTriplet,
                               float dcaCutOuterTriplet) {
  auto const &hh = *hhp;

  auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y;
  auto first = threadIdx.x;
  auto stride = blockDim.x;

  if (0 == (firstCellIndex + first)) {
    (*apc1) = 0;
    (*apc2) = 0;
  }  // ready for next kernel

  for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) {
    auto cellIndex = idx;
    auto &thisCell = cells[idx];
    auto innerHitId = thisCell.inner_hit_id();
    if (int(innerHitId) < isOuterHitOfCell.offset)
      continue;
    int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
    auto vi = isOuterHitOfCell[innerHitId].data();

    auto ri = thisCell.inner_r(hh);
    auto zi = thisCell.inner_z(hh);

    auto ro = thisCell.outer_r(hh);
    auto zo = thisCell.outer_z(hh);
    auto isBarrel = thisCell.inner_detIndex(hh) < caConstants::last_barrel_detIndex;

    for (int j = first; j < numberOfPossibleNeighbors; j += stride) {
      auto otherCell = __ldg(vi + j);
      auto &oc = cells[otherCell];
      auto r1 = oc.inner_r(hh);
      auto z1 = oc.inner_z(hh);
      bool aligned = GPUCACell::areAlignedRZ(
          r1,
          z1,
          ri,
          zi,
          ro,
          zo,
          ptmin,
          isBarrel ? CAThetaCutBarrel : CAThetaCutForward);  // 2.f*thetaCut); // FIXME tune cuts
      if (aligned && thisCell.dcaCut(hh,
                                     oc,
                                     oc.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet
                                                                                              : dcaCutOuterTriplet,
                                     hardCurvCut)) {  // FIXME tune cuts
        oc.addOuterNeighbor(cellIndex, *cellNeighbors);
        thisCell.setStatusBits(GPUCACell::StatusBit::kUsed);
        oc.setStatusBits(GPUCACell::StatusBit::kUsed);
      }
    }  // loop on inner cells
  }    // loop on outer cells
}
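
/* kernel_connect runs on a 2D grid: the y dimension strides over the cells and
   the x dimension strides over each cell's candidate inner neighbors. A minimal
   host-side launch sketch, illustrative only (the block shape and the *_d
   device-pointer names are assumptions):

     dim3 thrs(32, 4, 1);  // 32 threads per cell in x, 4 cells per block in y
     dim3 blks(1, (maxNumberOfDoublets + thrs.y - 1) / thrs.y, 1);
     kernel_connect<<<blks, thrs, 0, cudaStream>>>(
         apc1_d, apc2_d, hits_d, cells_d, nCells_d, cellNeighbors_d, isOuterHitOfCell,
         hardCurvCut, ptmin, CAThetaCutBarrel, CAThetaCutForward,
         dcaCutInnerTriplet, dcaCutOuterTriplet);
     cudaCheck(cudaGetLastError());
*/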

__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
                                     GPUCACell *__restrict__ cells,
                                     uint32_t const *nCells,
                                     gpuPixelDoublets::CellTracksVector *cellTracks,
                                     HitContainer *foundNtuplets,
                                     cms::cuda::AtomicPairCounter *apc,
                                     Quality *__restrict__ quality,
                                     unsigned int minHitsPerNtuplet) {
  // recursive: not obvious to widen
  auto const &hh = *hhp;

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.isKilled())
      continue;  // cut by earlyFishbone
    // we require at least three hits...
    if (thisCell.outerNeighbors().empty())
      continue;
    auto pid = thisCell.layerPairId();
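    // start the ntuplet search only from selected layer pairs: when running
    // quadruplets-only (minHitsPerNtuplet > 3) start only from pairs 0-2, the
    // ones anchored on the innermost barrel layer; otherwise skip pairs 8-12
    // (see the layerPairs table in gpuPixelDoublets.h)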
    auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12;
    if (doit) {
      GPUCACell::TmpTuple stack;
      stack.reset();
      thisCell.find_ntuplets<6>(
          hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
      assert(stack.empty());
      // printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
    }
  }
}

__global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const *nCells) {
  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto &thisCell = cells[idx];
    if (!thisCell.tracks().empty())
      thisCell.setStatusBits(GPUCACell::StatusBit::kInTrack);
  }
}

__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                         Quality const *__restrict__ quality,
                                         caConstants::TupleMultiplicity *tupleMultiplicity) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = foundNtuplets->size(it);
    if (nhits < 3)
      continue;
    if (quality[it] == pixelTrack::Quality::edup)
      continue;
    assert(quality[it] == pixelTrack::Quality::bad);
    if (nhits > 7)  // current limit
      printf("wrong mult %d %d\n", it, nhits);
    assert(nhits <= caConstants::maxHitsOnTrack);
    tupleMultiplicity->count(nhits);
  }
}

__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                        Quality const *__restrict__ quality,
                                        caConstants::TupleMultiplicity *tupleMultiplicity) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = foundNtuplets->size(it);
    if (nhits < 3)
      continue;
    if (quality[it] == pixelTrack::Quality::edup)
      continue;
    assert(quality[it] == pixelTrack::Quality::bad);
    if (nhits > 7)
      printf("wrong mult %d %d\n", it, nhits);
    assert(nhits <= caConstants::maxHitsOnTrack);
    tupleMultiplicity->fill(nhits, it);
  }
}
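
// kernel_countMultiplicity / kernel_fillMultiplicity are the two passes of the
// usual histogram fill of a one-to-many associator: the first pass only counts
// the entries per bin; the caller then runs a prefix scan over the bin sizes
// (cms::cuda's histogram finalize step) before the second pass writes the
// tuple indices into the final storage.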

__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
                                      TkSoA const *__restrict__ tracks,
                                      CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts,
                                      Quality *__restrict__ quality) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = tuples->size(it);
    if (nhits == 0)
      break;  // guard

    // if duplicate: not even fit
    if (quality[it] == pixelTrack::Quality::edup)
      continue;

    assert(quality[it] == pixelTrack::Quality::bad);

    // mark doublets as bad
    if (nhits < 3)
      continue;

    // if the fit has any invalid parameters, mark it as bad
    bool isNaN = false;
    for (int i = 0; i < 5; ++i) {
      isNaN |= std::isnan(tracks->stateAtBS.state(it)(i));
    }
    if (isNaN) {
#ifdef NTUPLE_DEBUG
      printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it));
#endif
      continue;
    }

    quality[it] = pixelTrack::Quality::strict;

    // compute a pT-dependent chi2 cut

    auto roughLog = [](float x) {
      // max diff [0.5,12] at 1.25 0.16143
      // average diff  0.0662998
      union IF {
        uint32_t i;
        float f;
      };
      IF z;
      z.f = x;
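      // NB: "1 < 21" below evaluates to 1 (not 1 << 21), so the net effect is a
      // plain truncation to the two leading mantissa bits; the frac[] bin
      // averages and the max-diff figures quoted above assume exactly this
      // truncation behavior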
      uint32_t lsb = 1 < 21;
      z.i += lsb;
      z.i >>= 21;
      auto f = z.i & 3;
      int ex = int(z.i >> 2) - 127;

      // log2(1+0.25*f)
      // averaged over bins
      const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f};
      return float(ex) + frac[f];
    };
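
    // roughLog(x) approximates log2(x): ex is the unbiased IEEE-754 exponent
    // and f selects one of four quarter-mantissa bins whose average
    // log2(1 + 0.25*f) is tabulated in frac[]. Worked example (illustrative):
    // x = 3.f has ex = 1 and f = 2, so roughLog(3.f) = 1 + 0.694562 = 1.6946,
    // to be compared with log2(3) = 1.5850.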

    // (see CAHitNtupletGeneratorGPU.cc)
    float pt = std::min<float>(tracks->pt(it), cuts.chi2MaxPt);
    float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]);
    if (tracks->chi2(it) >= chi2Cut) {
#ifdef NTUPLE_FIT_DEBUG
      printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n",
             it,
             tuples->size(it),
             tracks->pt(it),
             tracks->eta(it),
             tracks->chi2(it));
#endif
      continue;
    }

    quality[it] = pixelTrack::Quality::tight;

    // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta), Zip)
    // default cuts:
    //   - for triplets:    |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm
    //   - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm
    // (see CAHitNtupletGeneratorGPU.cc)
    auto const &region = (nhits > 3) ? cuts.quadruplet : cuts.triplet;
    bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and
                (std::abs(tracks->zip(it)) < region.maxZip);

    if (isOk)
      quality[it] = pixelTrack::Quality::highPurity;
  }
}

__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
                                        Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  //guard
    if (quality[idx] < pixelTrack::Quality::loose)
      continue;
    atomicAdd(&(counters->nLooseTracks), 1);
    if (quality[idx] < pixelTrack::Quality::strict)
      continue;
    atomicAdd(&(counters->nGoodTracks), 1);
  }
}

__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples,
                                        Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
      hitToTuple->count(*h);
  }
}

__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples,
                                       Quality const *__restrict__ quality,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
      hitToTuple->fill(*h, idx);
  }
}
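
// kernel_countHitInTracks / kernel_fillHitInTracks fill the hit-to-tuple
// associator with the same two-pass count/fill pattern as the multiplicity
// kernels above, again with the caller running the prefix scan over the bin
// sizes between the two launches.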

__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples,
                                         TrackingRecHit2DSOAView const *__restrict__ hhp,
                                         HitContainer *__restrict__ hitDetIndices) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  // copy offsets
  for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    hitDetIndices->off[idx] = tuples->off[idx];
  }
  // fill hit indices
  auto const &hh = *hhp;
  auto nhits = hh.nHits();
  for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) {
    assert(tuples->content[idx] < nhits);
    hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]);
  }
}

__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) {
  auto &tracks = *ptracks;
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  auto ntracks = apc->get().m;
  if (0 == first)
    tracks.setNTracks(ntracks);
  for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
    auto nHits = tracks.nHits(idx);
    assert(nHits >= 3);
    tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx);
  }
}

__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple,
                                             CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  auto &c = *counters;
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple->size(idx) == 0)
      continue;  // SHALL NOT BE break
    atomicAdd(&c.nUsedHits, 1);
    if (hitToTuple->size(idx) > 1)
      atomicAdd(&c.nDupHits, 1);
  }
}

__global__ void kernel_countSharedHit(int *__restrict__ nshared,
                                      HitContainer const *__restrict__ ptuples,
                                      Quality const *__restrict__ quality,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  constexpr auto loose = pixelTrack::Quality::loose;

  auto &hitToTuple = *phitToTuple;
  auto const &foundNtuplets = *ptuples;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    int nt = 0;

    // count "good" tracks
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] < loose)
        continue;
      ++nt;
    }

    if (nt < 2)
      continue;

    // now mark each track triplet as sharing a hit
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (foundNtuplets.size(*it) > 3)
        continue;
      atomicAdd(&nshared[*it], 1);
    }

  }  // hit loop
}

__global__ void kernel_markSharedHit(int const *__restrict__ nshared,
                                     HitContainer const *__restrict__ tuples,
                                     Quality *__restrict__ quality,
                                     bool dupPassThrough) {
  // constexpr auto bad = pixelTrack::Quality::bad;
  constexpr auto dup = pixelTrack::Quality::dup;
  constexpr auto loose = pixelTrack::Quality::loose;
  // constexpr auto strict = pixelTrack::Quality::strict;

  // quality to mark rejected
  auto const reject = dupPassThrough ? loose : dup;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  //guard
    if (quality[idx] <= reject)
      continue;
    if (nshared[idx] > 2)
      quality[idx] = reject;
  }
}

// mostly for very forward triplets.....
__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks,
                                       Quality *__restrict__ quality,
                                       uint16_t nmin,
                                       bool dupPassThrough,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    /* chi2 is bad for large pt
    auto score = [&](auto it, auto nl) {
      return nl < 4 ? std::abs(tracks.tip(it)) :  // tip for triplets
                 tracks.chi2(it);                 //chi2
    };
    */
    auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); };

    // full combinatorics
    for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) {
      auto const it = *ip;
      auto qi = quality[it];
      if (qi <= reject)
        continue;
      auto opi = tracks.stateAtBS.state(it)(2);
      auto e2opi = tracks.stateAtBS.covariance(it)(9);
      auto cti = tracks.stateAtBS.state(it)(3);
      auto e2cti = tracks.stateAtBS.covariance(it)(12);
      auto nli = tracks.nLayers(it);
      for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) {
        auto const jt = *jp;
        auto qj = quality[jt];
        if (qj <= reject)
          continue;
        auto opj = tracks.stateAtBS.state(jt)(2);
        auto ctj = tracks.stateAtBS.state(jt)(3);
        auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti);
        if ((cti - ctj) * (cti - ctj) > dct)
          continue;
        auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi);
        if ((opi - opj) * (opi - opj) > dop)
          continue;
        auto nlj = tracks.nLayers(jt);
        if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj)))))
          quality[jt] = reject;
        else {
          quality[it] = reject;
          break;
        }
      }
    }
  }
}

__global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                        TkSoA const *__restrict__ ptracks,
                                        Quality *__restrict__ quality,
                                        int nmin,
                                        bool dupPassThrough,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
  // quality of longest track
  auto const longTqual = pixelTrack::Quality::highPurity;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  auto const &hh = *hhp;
  int l1end = hh.hitsLayerStart()[1];
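  // hitsLayerStart()[1] is the index of the first hit on the second layer:
  // hit indices below l1end belong to BPix1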

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    int8_t maxNl = 0;

    // find maxNl
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] < longTqual)
        continue;
      // if (tracks.nHits(*it)==3) continue;
      auto nl = tracks.nLayers(*it);
      maxNl = std::max(nl, maxNl);
    }

    if (maxNl < 4)
      continue;

    // quad pass through (leave for tests)
    // maxNl = std::min(4, maxNl);

    // kill all tracks shorter than maxNl (only triplets???)
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      auto nl = tracks.nLayers(*it);

      // spare the track if the shared hit is on bpix1 and the track spans more than nmin layers
      if (idx < l1end and nl > nmin)
        continue;

      if (nl < maxNl && quality[*it] > reject)
        quality[*it] = reject;
    }
  }
}

__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks,
                                      Quality *__restrict__ quality,
                                      uint16_t nmin,
                                      bool dupPassThrough,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = pixelTrack::Quality::loose;
  /// min quality of good
  auto const good = pixelTrack::Quality::strict;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;
    bool onlyTriplets = true;

    // check if only triplets
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] <= good)
        continue;
      onlyTriplets &= tracks.isTriplet(*it);
      if (!onlyTriplets)
        break;
    }

    // only triplets
    if (!onlyTriplets)
      continue;

    // for triplets choose best tip!  (should we first find best quality???)
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) {
        mc = std::abs(tracks.tip(it));
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark worse ambiguities
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] > reject && it != im)
        quality[it] = reject;  //no race:  simple assignment of the same constant
    }

  }  // loop over hits
}

__global__ void kernel_simpleTripletCleaner(
    TkSoA const *__restrict__ ptracks,
    Quality *__restrict__ quality,
    uint16_t nmin,
    bool dupPassThrough,
    CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = pixelTrack::Quality::loose;
  /// min quality of good
  auto const good = pixelTrack::Quality::loose;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;

    // choose best tip!  (should we first find best quality???)
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) {
        mc = std::abs(tracks.tip(it));
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark worse ambiguities
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] > reject && tracks.isTriplet(it) && it != im)
        quality[it] = reject;  //no race:  simple assignment of the same constant
    }

  }  // loop over hits
}

__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                            HitContainer const *__restrict__ ptuples,
                                            TkSoA const *__restrict__ ptracks,
                                            Quality const *__restrict__ quality,
                                            CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
                                            int32_t firstPrint,
                                            int32_t lastPrint,
                                            int iev) {
  constexpr auto loose = pixelTrack::Quality::loose;
  auto const &hh = *hhp;
  auto const &foundNtuplets = *ptuples;
  auto const &tracks = *ptracks;
  int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x;
  for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
    auto nh = foundNtuplets.size(i);
    if (nh < 3)
      continue;
    if (quality[i] < loose)
      continue;
    printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n",
           10000 * iev + i,
           int(quality[i]),
           nh,
           tracks.nLayers(i),
           tracks.charge(i),
           tracks.pt(i),
           tracks.eta(i),
           tracks.phi(i),
           tracks.tip(i),
           tracks.zip(i),
           //           asinhf(fit_results[i].par(3)),
           tracks.chi2(i),
           hh.zGlobal(*foundNtuplets.begin(i)),
           hh.zGlobal(*(foundNtuplets.begin(i) + 1)),
           hh.zGlobal(*(foundNtuplets.begin(i) + 2)),
           nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0,
           nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0,
           nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0,
           nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0);
  }
}

__global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *counters) {
  auto const &c = *counters;
  printf(
      "||Counters | nEvents | nHits | nCells | nTuples | nFitTracks  |  nLooseTracks  |  nGoodTracks | nUsedHits | "
      "nDupHits | "
      "nFishCells | "
      "nKilledCells | "
      "nUsedCells | nZeroTrackCells ||\n");
  printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
         c.nEvents,
         c.nHits,
         c.nCells,
         c.nTuples,
         c.nFitTracks,
         c.nLooseTracks,
         c.nGoodTracks,
         c.nUsedHits,
         c.nDupHits,
         c.nFishCells,
         c.nKilledCells,
         c.nEmptyCells,
         c.nZeroTrackCells);
  printf("Counters Norm %lld ||  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.3f|  %.3f|  %.3f|  %.3f||\n",
         c.nEvents,
         c.nHits / double(c.nEvents),
         c.nCells / double(c.nEvents),
         c.nTuples / double(c.nEvents),
         c.nFitTracks / double(c.nEvents),
         c.nLooseTracks / double(c.nEvents),
         c.nGoodTracks / double(c.nEvents),
         c.nUsedHits / double(c.nEvents),
         c.nDupHits / double(c.nEvents),
         c.nFishCells / double(c.nCells),
         c.nKilledCells / double(c.nCells),
         c.nEmptyCells / double(c.nCells),
         c.nZeroTrackCells / double(c.nCells));
}