#include <cmath>
#include <cstdint>
#include <limits>

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"

#include "CAConstants.h"
#include "CAHitNtupletGeneratorKernels.h"
#include "GPUCACell.h"
#include "gpuFishbone.h"
#include "gpuPixelDoublets.h"

using HitsOnGPU = TrackingRecHit2DSOAView;
using HitsOnCPU = TrackingRecHit2DGPU;

using HitToTuple = caConstants::HitToTuple;
using TupleMultiplicity = caConstants::TupleMultiplicity;

using Quality = pixelTrack::Quality;
using TkSoA = pixelTrack::TrackSoA;
using HitContainer = pixelTrack::HitContainer;

namespace {

  constexpr uint16_t tkNotFound = std::numeric_limits<uint16_t>::max();  // sentinel: no best track found
  constexpr float maxScore = std::numeric_limits<float>::max();          // worst possible score
  constexpr float nSigma2 = 25.f;                                        // (5 sigma)^2 compatibility window

}  // namespace

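// A note on the duplicate removers below (explanatory sketch, not from the
// original sources): tracks i and j are treated as mutually compatible, and
// hence as candidate duplicates, only when both parameter pulls fall inside
// the nSigma2 = 25 = 5^2 window,
//
//   (op_i - op_j)^2 < nSigma2 * (var_j(op) + var_i(op))
//   (ct_i - ct_j)^2 < nSigma2 * (var_j(ct) + var_i(ct))
//
// where, as the variable names below suggest, state(2) ("op") and state(3)
// ("ct") are two of the five track parameters at the beamspot, and
// covariance(9) and covariance(12) are their variances in the packed
// symmetric 5x5 covariance matrix.
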
__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
                                      caConstants::TupleMultiplicity const *tupleMultiplicity,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple,
                                      cms::cuda::AtomicPairCounter *apc,
                                      GPUCACell const *__restrict__ cells,
                                      uint32_t const *__restrict__ nCells,
                                      gpuPixelDoublets::CellNeighborsVector const *cellNeighbors,
                                      gpuPixelDoublets::CellTracksVector const *cellTracks,
                                      GPUCACell::OuterHitOfCell const isOuterHitOfCell,
                                      int32_t nHits,
                                      uint32_t maxNumberOfDoublets,
                                      CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  auto first = threadIdx.x + blockIdx.x * blockDim.x;

  auto &c = *counters;

  if (0 == first) {
    atomicAdd(&c.nEvents, 1);
    atomicAdd(&c.nHits, nHits);
    atomicAdd(&c.nCells, *nCells);
    atomicAdd(&c.nTuples, apc->get().m);
    atomicAdd(&c.nFitTracks, tupleMultiplicity->size());
  }

#ifdef NTUPLE_DEBUG
  if (0 == first) {
    printf("number of found cells %d, found tuples %d with total hits %d out of %d %d\n",
           *nCells,
           apc->get().m,
           apc->get().n,
           nHits,
           hitToTuple->totOnes());
    if (apc->get().m < caConstants::maxNumberOfQuadruplets) {
      assert(foundNtuplets->size(apc->get().m) == 0);
      assert(foundNtuplets->size() == apc->get().n);
    }
  }

  for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) {
    if (foundNtuplets->size(idx) > 7)
      printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx));
    assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack);
    for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih)
      assert(int(*ih) < nHits);
  }
#endif

  if (0 == first) {
    if (apc->get().m >= caConstants::maxNumberOfQuadruplets)
      printf("Tuples overflow\n");
    if (*nCells >= maxNumberOfDoublets)
      printf("Cells overflow\n");
    if (cellNeighbors && cellNeighbors->full())
      printf("cellNeighbors overflow\n");
    if (cellTracks && cellTracks->full())
      printf("cellTracks overflow\n");
    if (int(hitToTuple->nOnes()) < nHits)
      printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits);
  }

  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.hasFishbone() && !thisCell.isKilled())
      atomicAdd(&c.nFishCells, 1);
    if (thisCell.outerNeighbors().full())
      printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId());
    if (thisCell.tracks().full())
      printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId());
    if (thisCell.isKilled())
      atomicAdd(&c.nKilledCells, 1);
    if (!thisCell.unused())
      atomicAdd(&c.nEmptyCells, 1);  // despite the name, this counts used cells (printed as "nUsedCells" below)
    if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id())))
      atomicAdd(&c.nZeroTrackCells, 1);
  }

  for (int idx = first, nt = nHits - isOuterHitOfCell.offset; idx < nt; idx += gridDim.x * blockDim.x) {
    if (isOuterHitOfCell.container[idx].full())
      printf("OuterHitOfCell overflow %d\n", idx);
  }
}

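// All kernels in this file use the canonical CUDA grid-stride loop, e.g.
//
//   auto first = threadIdx.x + blockIdx.x * blockDim.x;
//   for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { ... }
//
// so each kernel is valid for any launch geometry: the threads collectively
// sweep the whole index space in strides of gridDim.x * blockDim.x.
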
// reject every track passing through a killed cell
__global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *__restrict__ nCells, Quality *quality) {
  constexpr auto reject = pixelTrack::Quality::dup;

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (!thisCell.isKilled())
      continue;

    for (auto it : thisCell.tracks())
      quality[it] = reject;
  }
}

__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
                                             uint32_t const *__restrict__ nCells,
                                             TkSoA const *__restrict__ ptracks,
                                             Quality *quality,
                                             bool dupPassThrough) {
  // quality to mark rejected
  constexpr auto reject = pixelTrack::Quality::edup;

  auto const &tracks = *ptracks;

  assert(nCells);
  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];

    if (thisCell.tracks().size() < 2)
      continue;

    int8_t maxNl = 0;

    // find the maximal number of layers covered by the tracks sharing this cell
    for (auto it : thisCell.tracks()) {
      auto nl = tracks.nLayers(it);
      maxNl = std::max(nl, maxNl);
    }

    // reject every track covering fewer layers than the best one
    for (auto it : thisCell.tracks()) {
      if (tracks.nLayers(it) < maxNl)
        quality[it] = reject;
    }
  }
}

__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
                                            uint32_t const *__restrict__ nCells,
                                            TkSoA *__restrict__ tracks,
                                            bool dupPassThrough) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
  constexpr auto loose = pixelTrack::Quality::loose;

  assert(nCells);

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.tracks().size() < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;

    // the score: transverse impact parameter (the smaller, the better)
    auto score = [&](auto it) { return std::abs(tracks->tip(it)); };

    // full combinatorics: reject the worse of each compatible pair
    int ntr = thisCell.tracks().size();
    for (int i = 0; i < ntr - 1; ++i) {
      auto it = thisCell.tracks()[i];
      auto qi = tracks->quality(it);
      if (qi <= reject)
        continue;
      auto opi = tracks->stateAtBS.state(it)(2);
      auto e2opi = tracks->stateAtBS.covariance(it)(9);
      auto cti = tracks->stateAtBS.state(it)(3);
      auto e2cti = tracks->stateAtBS.covariance(it)(12);
      for (auto j = i + 1; j < ntr; ++j) {
        auto jt = thisCell.tracks()[j];
        auto qj = tracks->quality(jt);
        if (qj <= reject)
          continue;
#ifdef GPU_DEBUG
        // note: this debug check requires the tuple container to be passed in as "foundNtuplets"
        if (foundNtuplets->size(it) != foundNtuplets->size(jt))
          printf(" a mess\n");
#endif
        auto opj = tracks->stateAtBS.state(jt)(2);
        auto ctj = tracks->stateAtBS.state(jt)(3);
        auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);
        if ((cti - ctj) * (cti - ctj) > dct)
          continue;
        auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi);
        if ((opi - opj) * (opi - opj) > dop)
          continue;
        if ((qj < qi) || (qj == qi && score(it) < score(jt)))
          tracks->quality(jt) = reject;
        else {
          tracks->quality(it) = reject;
          break;
        }
      }
    }

    // find the best quality among the surviving tracks
    auto maxQual = reject;
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) > maxQual)
        maxQual = tracks->quality(it);
    }

    if (maxQual <= loose)
      continue;

    // find the best-scoring track among those at maxQual
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) == maxQual && score(it) < mc) {
        mc = score(it);
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // demote all the others to loose
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) > loose && it != im)
        tracks->quality(it) = loose;
    }
  }
}

// build the cellular-automaton graph: link pairs of cells that share a hit,
// are aligned in r-z, and pass the DCA cut
__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1,
                               cms::cuda::AtomicPairCounter *apc2,  // just zeroed here
                               GPUCACell::Hits const *__restrict__ hhp,
                               GPUCACell *cells,
                               uint32_t const *__restrict__ nCells,
                               gpuPixelDoublets::CellNeighborsVector *cellNeighbors,
                               GPUCACell::OuterHitOfCell const isOuterHitOfCell,
                               float hardCurvCut,
                               float ptmin,
                               float CAThetaCutBarrel,
                               float CAThetaCutForward,
                               float dcaCutInnerTriplet,
                               float dcaCutOuterTriplet) {
  auto const &hh = *hhp;

  auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y;
  auto first = threadIdx.x;
  auto stride = blockDim.x;

  if (0 == (firstCellIndex + first)) {
    (*apc1) = 0;
    (*apc2) = 0;
  }

  for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) {
    auto cellIndex = idx;
    auto &thisCell = cells[idx];
    auto innerHitId = thisCell.inner_hit_id();
    if (int(innerHitId) < isOuterHitOfCell.offset)
      continue;
    int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
    auto vi = isOuterHitOfCell[innerHitId].data();

    auto ri = thisCell.inner_r(hh);
    auto zi = thisCell.inner_z(hh);

    auto ro = thisCell.outer_r(hh);
    auto zo = thisCell.outer_z(hh);
    auto isBarrel = thisCell.inner_detIndex(hh) < caConstants::last_barrel_detIndex;

    // loop over the cells whose outer hit is this cell's inner hit
    for (int j = first; j < numberOfPossibleNeighbors; j += stride) {
      auto otherCell = __ldg(vi + j);
      auto &oc = cells[otherCell];
      auto r1 = oc.inner_r(hh);
      auto z1 = oc.inner_z(hh);
      bool aligned =
          GPUCACell::areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, isBarrel ? CAThetaCutBarrel : CAThetaCutForward);
      if (aligned && thisCell.dcaCut(hh,
                                     oc,
                                     oc.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet
                                                                                              : dcaCutOuterTriplet,
                                     hardCurvCut)) {
        oc.addOuterNeighbor(cellIndex, *cellNeighbors);
        thisCell.setStatusBits(GPUCACell::StatusBit::kUsed);
        oc.setStatusBits(GPUCACell::StatusBit::kUsed);
      }
    }
  }
}

__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
                                     GPUCACell *__restrict__ cells,
                                     uint32_t const *nCells,
                                     gpuPixelDoublets::CellTracksVector *cellTracks,
                                     HitContainer *foundNtuplets,
                                     cms::cuda::AtomicPairCounter *apc,
                                     Quality *__restrict__ quality,
                                     unsigned int minHitsPerNtuplet) {
  auto const &hh = *hhp;

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.isKilled())
      continue;

    if (thisCell.outerNeighbors().empty())
      continue;
    // start ntuplet building only from the inner layer pairs:
    // pid < 3 when quadruplets are required, otherwise pid < 8 || pid > 12
    auto pid = thisCell.layerPairId();
    auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12;
    if (doit) {
      GPUCACell::TmpTuple stack;
      stack.reset();
      thisCell.find_ntuplets<6>(
          hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
      assert(stack.empty());
    }
  }
}

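// Note (explanatory, not from the original sources): find_ntuplets performs an
// iterative depth-first walk from each root cell along its outerNeighbors,
// using "stack" as an explicit recursion stack, with the <6> template argument
// apparently bounding the search depth (hence the assert that the stack
// unwinds completely); every chain reaching minHitsPerNtuplet hits is pushed
// into foundNtuplets through the atomic pair counter.
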
__global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const *nCells) {
  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto &thisCell = cells[idx];
    if (!thisCell.tracks().empty())
      thisCell.setStatusBits(GPUCACell::StatusBit::kInTrack);
  }
}

__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                         Quality const *__restrict__ quality,
                                         caConstants::TupleMultiplicity *tupleMultiplicity) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = foundNtuplets->size(it);
    if (nhits < 3)
      continue;
    if (quality[it] == pixelTrack::Quality::edup)
      continue;
    assert(quality[it] == pixelTrack::Quality::bad);
    if (nhits > 7)
      printf("wrong mult %d %d\n", it, nhits);
    assert(nhits <= caConstants::maxHitsOnTrack);
    tupleMultiplicity->count(nhits);
  }
}

__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                        Quality const *__restrict__ quality,
                                        caConstants::TupleMultiplicity *tupleMultiplicity) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = foundNtuplets->size(it);
    if (nhits < 3)
      continue;
    if (quality[it] == pixelTrack::Quality::edup)
      continue;
    assert(quality[it] == pixelTrack::Quality::bad);
    if (nhits > 7)
      printf("wrong mult %d %d\n", it, nhits);
    assert(nhits <= caConstants::maxHitsOnTrack);
    tupleMultiplicity->fill(nhits, it);
  }
}

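// The two kernels above are the usual two-pass filling of a cms::cuda
// one-to-many associator: count() first accumulates per-bin sizes, and fill()
// stores each tuple index only after the caller has turned the counts into
// offsets with a prefix scan. A sketch of the host-side sequence (device
// pointer names are hypothetical):
//
//   kernel_countMultiplicity<<<blocks, threads, 0, stream>>>(tuples_d, quality_d, tupleMultiplicity_d);
//   cms::cuda::launchFinalize(tupleMultiplicity_d, stream);  // prefix-scan the bin counts
//   kernel_fillMultiplicity<<<blocks, threads, 0, stream>>>(tuples_d, quality_d, tupleMultiplicity_d);
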
__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
                                      TkSoA const *__restrict__ tracks,
                                      CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts,
                                      Quality *__restrict__ quality) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = tuples->size(it);
    if (nhits == 0)
      break;  // guard: tuples are stored contiguously, the first empty one ends the list

    // if duplicate: not even fitted
    if (quality[it] == pixelTrack::Quality::edup)
      continue;

    assert(quality[it] == pixelTrack::Quality::bad);

    // mark doublets as bad
    if (nhits < 3)
      continue;

    // if the fit produced any NaN parameter, keep the track as bad
    bool isNaN = false;
    for (int i = 0; i < 5; ++i) {
      isNaN |= std::isnan(tracks->stateAtBS.state(it)(i));
    }
    if (isNaN) {
#ifdef NTUPLE_DEBUG
      printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it));
#endif
      continue;
    }

    quality[it] = pixelTrack::Quality::strict;

    // compute a pT-dependent chi2 cut
    auto roughLog = [](float x) {
      // fast approximation of log2(x) from the float bit pattern
      union IF {
        uint32_t i;
        float f;
      };
      IF z;
      z.f = x;
      uint32_t lsb = 1;  // one-ulp nudge before truncating to quarter-bins
      z.i += lsb;
      z.i >>= 21;
      auto f = z.i & 3;
      int ex = int(z.i >> 2) - 127;

      // log2(1 + 0.25 * f), tabulated per quarter-bin
      const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f};
      return float(ex) + frac[f];
    };

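    // How roughLog works (explanatory sketch, not from the original sources):
    // for a positive IEEE-754 float, bits 23-30 hold the biased exponent and
    // bits 21-22 the two leading mantissa bits. After the shift, "ex" recovers
    // exponent - 127 and "f" indexes the mantissa quarter-bin
    // [1 + 0.25f, 1 + 0.25(f + 1)), so roughLog(x) = ex + frac[f] ~ log2(x),
    // e.g. roughLog(8.f) = 3 + 0.160497.
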
    // pT-dependent chi2 cut
    float pt = std::min<float>(tracks->pt(it), cuts.chi2MaxPt);
    float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]);
    if (tracks->chi2(it) >= chi2Cut) {
#ifdef NTUPLE_FIT_DEBUG
      printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n",
             it,
             tuples->size(it),
             tracks->pt(it),
             tracks->eta(it),
             tracks->chi2(it));
#endif
      continue;
    }

    quality[it] = pixelTrack::Quality::tight;

    // impact-parameter, pT and zip cuts, separate for quadruplets and triplets
    auto const &region = (nhits > 3) ? cuts.quadruplet : cuts.triplet;
    bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and
                (std::abs(tracks->zip(it)) < region.maxZip);

    if (isOk)
      quality[it] = pixelTrack::Quality::highPurity;
  }
}

__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
                                        Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    if (quality[idx] < pixelTrack::Quality::loose)
      continue;
    atomicAdd(&(counters->nLooseTracks), 1);
    if (quality[idx] < pixelTrack::Quality::strict)
      continue;
    atomicAdd(&(counters->nGoodTracks), 1);
  }
}

__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples,
                                        Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
      hitToTuple->count(*h);
  }
}

__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples,
                                       Quality const *__restrict__ quality,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
      hitToTuple->fill(*h, idx);
  }
}

__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples,
                                         TrackingRecHit2DSOAView const *__restrict__ hhp,
                                         HitContainer *__restrict__ hitDetIndices) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;

  // copy the tuple offsets
  for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    hitDetIndices->off[idx] = tuples->off[idx];
  }

  // translate hit indices into detector indices
  auto const &hh = *hhp;
  auto nhits = hh.nHits();
  for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) {
    assert(tuples->content[idx] < nhits);
    hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]);
  }
}

__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) {
  auto &tracks = *ptracks;
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  auto ntracks = apc->get().m;
  if (0 == first)
    tracks.setNTracks(ntracks);
  for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
    auto nHits = tracks.nHits(idx);
    assert(nHits >= 3);
    tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx);
  }
}

__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple,
                                             CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  auto &c = *counters;
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple->size(idx) == 0)
      continue;  // note: no "break" here, the hit-to-tuple map is not contiguous
    atomicAdd(&c.nUsedHits, 1);
    if (hitToTuple->size(idx) > 1)
      atomicAdd(&c.nDupHits, 1);
  }
}

__global__ void kernel_countSharedHit(int *__restrict__ nshared,
                                      HitContainer const *__restrict__ ptuples,
                                      Quality const *__restrict__ quality,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  constexpr auto loose = pixelTrack::Quality::loose;

  auto &hitToTuple = *phitToTuple;
  auto const &foundNtuplets = *ptuples;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    int nt = 0;

    // count the "good" tracks sharing this hit
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] < loose)
        continue;
      ++nt;
    }

    if (nt < 2)
      continue;

    // count the shared hits of the triplets among them
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (foundNtuplets.size(*it) > 3)
        continue;
      atomicAdd(&nshared[*it], 1);
    }
  }
}

__global__ void kernel_markSharedHit(int const *__restrict__ nshared,
                                     HitContainer const *__restrict__ tuples,
                                     Quality *__restrict__ quality,
                                     bool dupPassThrough) {
  constexpr auto dup = pixelTrack::Quality::dup;
  constexpr auto loose = pixelTrack::Quality::loose;

  // quality to mark rejected
  auto const reject = dupPassThrough ? loose : dup;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    if (quality[idx] <= reject)
      continue;
    if (nshared[idx] > 2)
      quality[idx] = reject;
  }
}

// reject duplicates among tracks sharing a hit: for each compatible pair keep
// the track with more layers, then better quality, then better score
__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks,
                                       Quality *__restrict__ quality,
                                       uint16_t nmin,
                                       bool dupPassThrough,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    // the score: transverse impact parameter (nl kept in the signature for alternative scores)
    auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); };

    // full combinatorics over the tracks sharing this hit
    for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) {
      auto const it = *ip;
      auto qi = quality[it];
      if (qi <= reject)
        continue;
      auto opi = tracks.stateAtBS.state(it)(2);
      auto e2opi = tracks.stateAtBS.covariance(it)(9);
      auto cti = tracks.stateAtBS.state(it)(3);
      auto e2cti = tracks.stateAtBS.covariance(it)(12);
      auto nli = tracks.nLayers(it);
      for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) {
        auto const jt = *jp;
        auto qj = quality[jt];
        if (qj <= reject)
          continue;
        auto opj = tracks.stateAtBS.state(jt)(2);
        auto ctj = tracks.stateAtBS.state(jt)(3);
        auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti);
        if ((cti - ctj) * (cti - ctj) > dct)
          continue;
        auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi);
        if ((opi - opj) * (opi - opj) > dop)
          continue;
        auto nlj = tracks.nLayers(jt);
        if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj)))))
          quality[jt] = reject;
        else {
          quality[it] = reject;
          break;
        }
      }
    }
  }
}

__global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                        TkSoA const *__restrict__ ptracks,
                                        Quality *__restrict__ quality,
                                        int nmin,
                                        bool dupPassThrough,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
  // required quality of the "long" tracks
  auto const longTqual = pixelTrack::Quality::highPurity;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  auto const &hh = *hhp;
  int l1end = hh.hitsLayerStart()[1];  // end of the BPIX1 hit range

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    int8_t maxNl = 0;

    // find the maximal number of layers among the high-purity tracks sharing this hit
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] < longTqual)
        continue;
      auto nl = tracks.nLayers(*it);
      maxNl = std::max(nl, maxNl);
    }

    if (maxNl < 4)
      continue;

    // reject all tracks shorter than the longest one
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      auto nl = tracks.nLayers(*it);

      // keep tracks above nmin layers whose shared hit is on BPIX1
      if (idx < l1end and nl > nmin)
        continue;

      if (nl < maxNl && quality[*it] > reject)
        quality[*it] = reject;
    }
  }
}

__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks,
                                      Quality *__restrict__ quality,
                                      uint16_t nmin,
                                      bool dupPassThrough,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = pixelTrack::Quality::loose;
  // minimal quality of a "good" track
  auto const good = pixelTrack::Quality::strict;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;
    bool onlyTriplets = true;

    // check whether only triplets share this hit
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] <= good)
        continue;
      onlyTriplets &= tracks.isTriplet(*it);
      if (!onlyTriplets)
        break;
    }

    // nothing to clean if a longer track shares this hit
    if (!onlyTriplets)
      continue;

    // among the triplets, choose the one with the best (smallest) tip
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) {
        mc = std::abs(tracks.tip(it));
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark the worse ambiguities as rejected
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] > reject && it != im)
        quality[it] = reject;
    }
  }
}

__global__ void kernel_simpleTripletCleaner(
    TkSoA const *__restrict__ ptracks,
    Quality *__restrict__ quality,
    uint16_t nmin,
    bool dupPassThrough,
    CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = pixelTrack::Quality::loose;
  // minimal quality of a "good" track (loose here, unlike kernel_tripletCleaner)
  auto const good = pixelTrack::Quality::loose;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;

    // choose the track with the best (smallest) tip among all sharing this hit
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) {
        mc = std::abs(tracks.tip(it));
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // reject all other triplets sharing this hit
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] > reject && tracks.isTriplet(it) && it != im)
        quality[it] = reject;
    }
  }
}

__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                            HitContainer const *__restrict__ ptuples,
                                            TkSoA const *__restrict__ ptracks,
                                            Quality const *__restrict__ quality,
                                            CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
                                            int32_t firstPrint,
                                            int32_t lastPrint,
                                            int iev) {
  constexpr auto loose = pixelTrack::Quality::loose;
  auto const &hh = *hhp;
  auto const &foundNtuplets = *ptuples;
  auto const &tracks = *ptracks;
  int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x;
  for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
    auto nh = foundNtuplets.size(i);
    if (nh < 3)
      continue;
    if (quality[i] < loose)
      continue;
    printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n",
           10000 * iev + i,
           int(quality[i]),
           nh,
           tracks.nLayers(i),
           tracks.charge(i),
           tracks.pt(i),
           tracks.eta(i),
           tracks.phi(i),
           tracks.tip(i),
           tracks.zip(i),
           tracks.chi2(i),
           hh.zGlobal(*foundNtuplets.begin(i)),
           hh.zGlobal(*(foundNtuplets.begin(i) + 1)),
           hh.zGlobal(*(foundNtuplets.begin(i) + 2)),
           nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0,
           nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0,
           nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0,
           nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0);
  }
}

__global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *counters) {
  auto const &c = *counters;
  printf(
      "||Counters | nEvents | nHits | nCells | nTuples | nFitTracks | nLooseTracks | nGoodTracks | nUsedHits | "
      "nDupHits | "
      "nFishCells | "
      "nKilledCells | "
      "nUsedCells | nZeroTrackCells ||\n");
  printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
         c.nEvents,
         c.nHits,
         c.nCells,
         c.nTuples,
         c.nFitTracks,
         c.nLooseTracks,
         c.nGoodTracks,
         c.nUsedHits,
         c.nDupHits,
         c.nFishCells,
         c.nKilledCells,
         c.nEmptyCells,
         c.nZeroTrackCells);
  printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f| %.3f||\n",
         c.nEvents,
         c.nHits / double(c.nEvents),
         c.nCells / double(c.nEvents),
         c.nTuples / double(c.nEvents),
         c.nFitTracks / double(c.nEvents),
         c.nLooseTracks / double(c.nEvents),
         c.nGoodTracks / double(c.nEvents),
         c.nUsedHits / double(c.nEvents),
         c.nDupHits / double(c.nEvents),
         c.nFishCells / double(c.nCells),
         c.nKilledCells / double(c.nCells),
         c.nEmptyCells / double(c.nCells),
         c.nZeroTrackCells / double(c.nCells));
}