// Source retrieved from the CMSSW project via the LXR code browser
// (file indexing completed on 2021-12-24 02:22:16).

#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h
#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h

#include <cstdint>
#include <cstdio>
#include <limits>

#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
#include "DataFormats/Math/interface/approx_atan2.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h"

namespace gpuPixelRecHits {

  // Build pixel rec-hits from digis and their clusters.
  //
  // Launch layout: one CUDA block per module (blockIdx.x indexes the module
  // entry in the cluster SoA).  Block 0 additionally copies the average
  // geometry, corrected for the beamspot position, into the output hits view.
  // Shared memory: one statically allocated pixelCPEforGPU::ClusParams,
  // reused for batches of up to MaxHitsInIter clusters per module.
  //
  //   cpeParams   - CPE constants and per-detector parameters (device, read-only)
  //   bs          - beamspot position; subtracted from all global coordinates
  //   digis       - SoA view of pixel digis (xx, yy, adc, moduleInd, clus)
  //   numElements - number of digi elements to scan
  //   pclusters   - SoA view of clusters (module starts, counts, module ids)
  //   phits       - output hit SoA; hits of module `me` are written starting
  //                 at clusters.clusModuleStart(me)
  __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams,
                          BeamSpotPOD const* __restrict__ bs,
                          SiPixelDigisCUDASOAView const digis,
                          int numElements,
                          SiPixelClustersCUDA::SiPixelClustersCUDASOAView const* __restrict__ pclusters,
                          TrackingRecHit2DSOAView* phits) {
    // FIXME
    // the compiler seems NOT to optimize loads from views (even in a simple test case)
    // The whole gymnastics here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature
    // not using views (passing a gazillion of array pointers) seems to produce the fastest code (but it is harder to maintain)
    assert(phits);
    assert(cpeParams);
    auto& hits = *phits;

    auto const& clusters = *pclusters;
    auto isPhase2 = cpeParams->commonParams().isPhase2;
    // copy average geometry corrected by beamspot . FIXME (move it somewhere else???)
    // Only block 0 does this; all coordinates are shifted so the beamspot is the origin.
    if (0 == blockIdx.x) {
      auto& agc = hits.averageGeometry();
      auto const& ag = cpeParams->averageGeometry();
      // number of barrel ladders depends on the detector generation
      auto nLadders =
          isPhase2 ? phase2PixelTopology::numberOfLaddersInBarrel : phase1PixelTopology::numberOfLaddersInBarrel;

      // one ladder per thread, block-strided
      for (int il = threadIdx.x, nl = nLadders; il < nl; il += blockDim.x) {
        agc.ladderZ[il] = ag.ladderZ[il] - bs->z;
        agc.ladderX[il] = ag.ladderX[il] - bs->x;
        agc.ladderY[il] = ag.ladderY[il] - bs->y;
        // transverse radius recomputed from the beamspot-corrected x,y
        agc.ladderR[il] = sqrt(agc.ladderX[il] * agc.ladderX[il] + agc.ladderY[il] * agc.ladderY[il]);
        agc.ladderMinZ[il] = ag.ladderMinZ[il] - bs->z;
        agc.ladderMaxZ[il] = ag.ladderMaxZ[il] - bs->z;
      }

      // the two endcap z positions are corrected once, by thread 0
      if (0 == threadIdx.x) {
        agc.endCapZ[0] = ag.endCapZ[0] - bs->z;
        agc.endCapZ[1] = ag.endCapZ[1] - bs->z;
        //         printf("endcapZ %f %f\n",agc.endCapZ[0],agc.endCapZ[1]);
      }
    }

    // to be moved in common namespace...
    using gpuClustering::invalidModuleId;
    constexpr int32_t MaxHitsInIter = pixelCPEforGPU::MaxHitsInIter;

    using ClusParams = pixelCPEforGPU::ClusParams;

    // as usual one block per module
    // shared accumulator for (at most MaxHitsInIter) clusters of this module
    __shared__ ClusParams clusParams;

    auto me = clusters.moduleId(blockIdx.x);
    int nclus = clusters.clusInModule(me);

    // nclus depends only on blockIdx.x, so this early return is taken by the
    // whole block together (no divergence hazard w.r.t. the barriers below)
    if (0 == nclus)
      return;
#ifdef GPU_DEBUG
    // sanity check: the first valid digi at/after this module's start must belong to it
    if (threadIdx.x == 0) {
      auto k = clusters.moduleStart(1 + blockIdx.x);
      while (digis.moduleInd(k) == invalidModuleId)
        ++k;
      assert(digis.moduleInd(k) == me);
    }
#endif
#ifdef GPU_DEBUG
    if (me % 100 == 1)
      if (threadIdx.x == 0)
        printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me));
#endif
    // process the module's clusters in batches that fit the shared clusParams
    for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) {
      int nClusInIter = std::min(MaxHitsInIter, endClus - startClus);
      int lastClus = startClus + nClusInIter;
      assert(nClusInIter <= nclus);
      assert(nClusInIter > 0);
      assert(lastClus <= nclus);

      // either batching is actually needed, or the single batch covers everything
      assert(nclus > MaxHitsInIter || (0 == startClus && nClusInIter == nclus && lastClus == nclus));

      // init
      // seed the per-cluster accumulators: min fields start at max() so that
      // atomicMin works, max fields at 0, charges at 0
      for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) {
        clusParams.minRow[ic] = std::numeric_limits<uint32_t>::max();
        clusParams.maxRow[ic] = 0;
        clusParams.minCol[ic] = std::numeric_limits<uint32_t>::max();
        clusParams.maxCol[ic] = 0;
        clusParams.charge[ic] = 0;
        clusParams.q_f_X[ic] = 0;
        clusParams.q_l_X[ic] = 0;
        clusParams.q_f_Y[ic] = 0;
        clusParams.q_l_Y[ic] = 0;
      }

      __syncthreads();

      // one thread per "digi"
      // pass 1: per-cluster bounding box (min/max row and column) via
      // shared-memory atomics.  Digis of one module are contiguous from
      // moduleStart on; a digi with a different valid module id marks the end
      // (hence the break), while invalid entries are simply skipped.
      auto first = clusters.moduleStart(1 + blockIdx.x) + threadIdx.x;
      for (int i = first; i < numElements; i += blockDim.x) {
        auto id = digis.moduleInd(i);
        if (id == invalidModuleId)
          continue;  // not valid
        if (id != me)
          break;  // end of module
        auto cl = digis.clus(i);
        if (cl < startClus || cl >= lastClus)
          continue;  // cluster not in the current batch
        cl -= startClus;
        assert(cl >= 0);
        assert(cl < MaxHitsInIter);
        auto x = digis.xx(i);
        auto y = digis.yy(i);
        atomicMin(&clusParams.minRow[cl], x);
        atomicMax(&clusParams.maxRow[cl], x);
        atomicMin(&clusParams.minCol[cl], y);
        atomicMax(&clusParams.maxCol[cl], y);
      }

      __syncthreads();

      // pass 2: accumulate the total cluster charge (unclamped) and the charge
      // collected on the bounding-box edge pixels; the edge charges are clamped
      // to the per-detector pixmx before being summed
      auto pixmx = cpeParams->detParams(me).pixmx;
      for (int i = first; i < numElements; i += blockDim.x) {
        auto id = digis.moduleInd(i);
        if (id == invalidModuleId)
          continue;  // not valid
        if (id != me)
          break;  // end of module
        auto cl = digis.clus(i);
        if (cl < startClus || cl >= lastClus)
          continue;  // cluster not in the current batch
        cl -= startClus;
        assert(cl >= 0);
        assert(cl < MaxHitsInIter);
        auto x = digis.xx(i);
        auto y = digis.yy(i);
        auto ch = digis.adc(i);
        atomicAdd(&clusParams.charge[cl], ch);  // total charge keeps the raw adc
        ch = std::min(ch, pixmx);               // clamp only for the edge sums
        if (clusParams.minRow[cl] == x)
          atomicAdd(&clusParams.q_f_X[cl], ch);
        if (clusParams.maxRow[cl] == x)
          atomicAdd(&clusParams.q_l_X[cl], ch);
        if (clusParams.minCol[cl] == y)
          atomicAdd(&clusParams.q_f_Y[cl], ch);
        if (clusParams.maxCol[cl] == y)
          atomicAdd(&clusParams.q_l_Y[cl], ch);
      }

      __syncthreads();

      // next one cluster per thread...
      // pass 3: run the CPE on each cluster of the batch and store the hit

      first = clusters.clusModuleStart(me) + startClus;
      for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) {
        auto h = first + ic;  // output index in global memory

        assert(h < hits.nHits());
        assert(h < clusters.clusModuleStart(me + 1));

        // CPE: position always; errors from DB for phase 1, from cluster size for phase 2.
        // These fill clusParams fields (xpos/ypos, xerr/yerr, xsize/ysize, status) read below.
        pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
        if (!isPhase2)
          pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
        else
          pixelCPEforGPU::errorFromSize(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);

        // store it
        hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]);
        hits.detectorIndex(h) = me;

        float xl, yl;
        hits.xLocal(h) = xl = clusParams.xpos[ic];
        hits.yLocal(h) = yl = clusParams.ypos[ic];

        hits.clusterSizeX(h) = clusParams.xsize[ic];
        hits.clusterSizeY(h) = clusParams.ysize[ic];

        // stored errors are squared; apeXX/apeYY presumably the squared
        // alignment position error contributions — TODO confirm in pixelCPEforGPU
        hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX;
        hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY;

        // keep it local for computations
        float xg, yg, zg;
        // to global and compute phi...
        cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg);
        // here correct for the beamspot...
        xg -= bs->x;
        yg -= bs->y;
        zg -= bs->z;

        hits.xGlobal(h) = xg;
        hits.yGlobal(h) = yg;
        hits.zGlobal(h) = zg;

        hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg);
        // approximate atan2 packed into a short (see approx_atan2.h for semantics)
        hits.iphi(h) = unsafe_atan2s<7>(yg, xg);
      }
      // shared clusParams is reused by the next batch: wait for all threads
      __syncthreads();
    }  // end loop on batches
  }

}  // namespace gpuPixelRecHits

#endif  // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h