Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-12-02 03:39:33

0001 #ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h
0002 #define RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h
0003 
0004 #include <algorithm>
0005 #include <cmath>
0006 #include <cstdint>
0007 #include <cstdio>
0008 #include <limits>
0009 
0010 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
0011 #include "DataFormats/Math/interface/approx_atan2.h"
0012 #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h"
0013 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0014 
0015 #include "CAConstants.h"
0016 #include "GPUCACell.h"
0017 
0018 namespace gpuPixelDoublets {
0019 
0020   using CellNeighbors = caConstants::CellNeighbors;
0021   using CellTracks = caConstants::CellTracks;
0022   using CellNeighborsVector = caConstants::CellNeighborsVector;
0023   using CellTracksVector = caConstants::CellTracksVector;
0024 
0025   __device__ __forceinline__ void doubletsFromHisto(uint8_t const* __restrict__ layerPairs,
0026                                                     uint32_t nPairs,
0027                                                     GPUCACell* cells,
0028                                                     uint32_t* nCells,
0029                                                     CellNeighborsVector* cellNeighbors,
0030                                                     CellTracksVector* cellTracks,
0031                                                     TrackingRecHit2DSOAView const& __restrict__ hh,
0032                                                     GPUCACell::OuterHitOfCell isOuterHitOfCell,
0033                                                     int16_t const* __restrict__ phicuts,
0034                                                     float const* __restrict__ minz,
0035                                                     float const* __restrict__ maxz,
0036                                                     float const* __restrict__ maxr,
0037                                                     bool ideal_cond,
0038                                                     bool doClusterCut,
0039                                                     bool doZ0Cut,
0040                                                     bool doPtCut,
0041                                                     uint32_t maxNumOfDoublets) {
0042     // ysize cuts (z in the barrel)  times 8
0043     // these are used if doClusterCut is true
0044     constexpr int minYsizeB1 = 36;
0045     constexpr int minYsizeB2 = 28;
0046     constexpr int maxDYsize12 = 28;
0047     constexpr int maxDYsize = 20;
0048     constexpr int maxDYPred = 20;
0049     constexpr float dzdrFact = 8 * 0.0285 / 0.015;  // from dz/dr to "DY"
0050 
0051     bool isOuterLadder = ideal_cond;
0052 
0053     using PhiBinner = TrackingRecHit2DSOAView::PhiBinner;
0054 
0055     auto const& __restrict__ phiBinner = hh.phiBinner();
0056     uint32_t const* __restrict__ offsets = hh.hitsLayerStart();
0057     assert(offsets);
0058 
0059     auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; };
0060 
0061     // nPairsMax to be optimized later (originally was 64).
0062     // If it should be much bigger, consider using a block-wide parallel prefix scan,
0063     // e.g. see  https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html
0064     const int nPairsMax = caConstants::maxNumberOfLayerPairs;
0065     assert(nPairs <= nPairsMax);
0066     __shared__ uint32_t innerLayerCumulativeSize[nPairsMax];
0067     __shared__ uint32_t ntot;
0068     if (threadIdx.y == 0 && threadIdx.x == 0) {
0069       innerLayerCumulativeSize[0] = layerSize(layerPairs[0]);
0070       for (uint32_t i = 1; i < nPairs; ++i) {
0071         innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(layerPairs[2 * i]);
0072       }
0073       ntot = innerLayerCumulativeSize[nPairs - 1];
0074     }
0075     __syncthreads();
0076 
0077     // x runs faster
0078     auto idy = blockIdx.y * blockDim.y + threadIdx.y;
0079     auto first = threadIdx.x;
0080     auto stride = blockDim.x;
0081 
0082     uint32_t pairLayerId = 0;  // cannot go backward
0083     for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y) {
0084       while (j >= innerLayerCumulativeSize[pairLayerId++])
0085         ;
0086       --pairLayerId;  // move to lower_bound ??
0087 
0088       assert(pairLayerId < nPairs);
0089       assert(j < innerLayerCumulativeSize[pairLayerId]);
0090       assert(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId - 1]);
0091 
0092       uint8_t inner = layerPairs[2 * pairLayerId];
0093       uint8_t outer = layerPairs[2 * pairLayerId + 1];
0094       assert(outer > inner);
0095 
0096       auto hoff = PhiBinner::histOff(outer);
0097 
0098       auto i = (0 == pairLayerId) ? j : j - innerLayerCumulativeSize[pairLayerId - 1];
0099       i += offsets[inner];
0100 
0101       // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j);
0102 
0103       assert(i >= offsets[inner]);
0104       assert(i < offsets[inner + 1]);
0105 
0106       // found hit corresponding to our cuda thread, now do the job
0107       auto mi = hh.detectorIndex(i);
0108       if (mi > gpuClustering::maxNumModules)
0109         continue;  // invalid
0110 
0111       /* maybe clever, not effective when zoCut is on
0112       auto bpos = (mi%8)/4;  // if barrel is 1 for z>0
0113       auto fpos = (outer>3) & (outer<7);
0114       if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue;
0115       */
0116 
0117       auto mez = hh.zGlobal(i);
0118 
0119       if (mez < minz[pairLayerId] || mez > maxz[pairLayerId])
0120         continue;
0121 
0122       int16_t mes = -1;  // make compiler happy
0123       if (doClusterCut) {
0124         // if ideal treat inner ladder as outer
0125         if (inner == 0)
0126           assert(mi < 96);
0127         isOuterLadder = ideal_cond ? true : 0 == (mi / 8) % 2;  // only for B1/B2/B3 B4 is opposite, FPIX:noclue...
0128 
0129         // in any case we always test mes>0 ...
0130         mes = inner > 0 || isOuterLadder ? hh.clusterSizeY(i) : -1;
0131 
0132         if (inner == 0 && outer > 3)  // B1 and F1
0133           if (mes > 0 && mes < minYsizeB1)
0134             continue;                 // only long cluster  (5*8)
0135         if (inner == 1 && outer > 3)  // B2 and F1
0136           if (mes > 0 && mes < minYsizeB2)
0137             continue;
0138       }
0139       auto mep = hh.iphi(i);
0140       auto mer = hh.rGlobal(i);
0141 
0142       // all cuts: true if fails
0143       constexpr float z0cut = 12.f;      // cm
0144       constexpr float hardPtCut = 0.5f;  // GeV
0145       // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field)
0146       constexpr float minRadius = hardPtCut * 87.78f;
0147       constexpr float minRadius2T4 = 4.f * minRadius * minRadius;
0148       auto ptcut = [&](int j, int16_t idphi) {
0149         auto r2t4 = minRadius2T4;
0150         auto ri = mer;
0151         auto ro = hh.rGlobal(j);
0152         auto dphi = short2phi(idphi);
0153         return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri);
0154       };
0155       auto z0cutoff = [&](int j) {
0156         auto zo = hh.zGlobal(j);
0157         auto ro = hh.rGlobal(j);
0158         auto dr = ro - mer;
0159         return dr > maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr;
0160       };
0161 
0162       auto zsizeCut = [&](int j) {
0163         auto onlyBarrel = outer < 4;
0164         auto so = hh.clusterSizeY(j);
0165         auto dy = inner == 0 ? maxDYsize12 : maxDYsize;
0166         // in the barrel cut on difference in size
0167         // in the endcap on the prediction on the first layer (actually in the barrel only: happen to be safe for endcap as well)
0168         // FIXME move pred cut to z0cutoff to optmize loading of and computaiton ...
0169         auto zo = hh.zGlobal(j);
0170         auto ro = hh.rGlobal(j);
0171         return onlyBarrel ? mes > 0 && so > 0 && std::abs(so - mes) > dy
0172                           : (inner < 4) && mes > 0 &&
0173                                 std::abs(mes - int(std::abs((mez - zo) / (mer - ro)) * dzdrFact + 0.5f)) > maxDYPred;
0174       };
0175 
0176       auto iphicut = phicuts[pairLayerId];
0177 
0178       auto kl = PhiBinner::bin(int16_t(mep - iphicut));
0179       auto kh = PhiBinner::bin(int16_t(mep + iphicut));
0180       auto incr = [](auto& k) { return k = (k + 1) % PhiBinner::nbins(); };
0181 
0182 #ifdef GPU_DEBUG
0183       int tot = 0;
0184       int nmin = 0;
0185       int tooMany = 0;
0186 #endif
0187 
0188       auto khh = kh;
0189       incr(khh);
0190       for (auto kk = kl; kk != khh; incr(kk)) {
0191 #ifdef GPU_DEBUG
0192         if (kk != kl && kk != kh)
0193           nmin += phiBinner.size(kk + hoff);
0194 #endif
0195         auto const* __restrict__ p = phiBinner.begin(kk + hoff);
0196         auto const* __restrict__ e = phiBinner.end(kk + hoff);
0197         p += first;
0198         for (; p < e; p += stride) {
0199           auto oi = __ldg(p);
0200           assert(oi >= offsets[outer]);
0201           assert(oi < offsets[outer + 1]);
0202           auto mo = hh.detectorIndex(oi);
0203           if (mo > gpuClustering::maxNumModules)
0204             continue;  //    invalid
0205 
0206           if (doZ0Cut && z0cutoff(oi))
0207             continue;
0208 
0209           auto mop = hh.iphi(oi);
0210           uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop)));
0211           if (idphi > iphicut)
0212             continue;
0213 
0214           if (doClusterCut && zsizeCut(oi))
0215             continue;
0216           if (doPtCut && ptcut(oi, idphi))
0217             continue;
0218 
0219           auto ind = atomicAdd(nCells, 1);
0220           if (ind >= maxNumOfDoublets) {
0221             atomicSub(nCells, 1);
0222             break;
0223           }  // move to SimpleVector??
0224           // int layerPairId, int doubletId, int innerHitId, int outerHitId)
0225           cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, i, oi);
0226           isOuterHitOfCell[oi].push_back(ind);
0227 #ifdef GPU_DEBUG
0228           if (isOuterHitOfCell[oi].full())
0229             ++tooMany;
0230           ++tot;
0231 #endif
0232         }
0233       }
0234 #ifdef GPU_DEBUG
0235       if (tooMany > 0)
0236         printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany);
0237 #endif
0238     }  // loop in block...
0239   }
0240 
0241 }  // namespace gpuPixelDoublets
0242 
0243 #endif  // RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h