#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h
#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h

#include <cstdint>
#include <cstdio>
#include <limits>

#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
#include "DataFormats/Math/interface/approx_atan2.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"

//#define GPU_DEBUG 1
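// (uncomment the define above to enable the consistency checks and printouts
// guarded by GPU_DEBUG in the kernel below)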
namespace gpuPixelRecHits {

  template <typename TrackerTraits>
  __global__ void getHits(pixelCPEforGPU::ParamsOnGPUT<TrackerTraits> const* __restrict__ cpeParams,
                          BeamSpotPOD const* __restrict__ bs,
                          SiPixelDigisCUDASOAConstView digis,
                          int numElements,
                          SiPixelClustersCUDASOAConstView clusters,
                          TrackingRecHitSoAView<TrackerTraits> hits) {
    // FIXME
    // the compiler seems NOT to optimize loads from views (even in a simple test case)
    // The whole gymnastics here of copying or not copying is a purely heuristic exercise that seems to produce the fastest code with the above signature
    // not using views (passing a gazillion array pointers) seems to produce the fastest code (but it is harder to maintain)

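    // Overview: one block processes one pixel module. For each batch of up to
    // MaxHitsInIter clusters the kernel makes three passes over the module's
    // digis and clusters: (1) find each cluster's bounding box, (2) accumulate
    // its total and edge charges, (3) run the CPE per cluster and fill the
    // output hits.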
    assert(cpeParams);

    // copy the average geometry, corrected for the beamspot. FIXME (move it somewhere else???)
    if (0 == blockIdx.x) {
      auto& agc = hits.averageGeometry();
      auto const& ag = cpeParams->averageGeometry();
      auto nLadders = TrackerTraits::numberOfLaddersInBarrel;

      for (int il = threadIdx.x, nl = nLadders; il < nl; il += blockDim.x) {
        agc.ladderZ[il] = ag.ladderZ[il] - bs->z;
        agc.ladderX[il] = ag.ladderX[il] - bs->x;
        agc.ladderY[il] = ag.ladderY[il] - bs->y;
        agc.ladderR[il] = sqrt(agc.ladderX[il] * agc.ladderX[il] + agc.ladderY[il] * agc.ladderY[il]);
        agc.ladderMinZ[il] = ag.ladderMinZ[il] - bs->z;
        agc.ladderMaxZ[il] = ag.ladderMaxZ[il] - bs->z;
      }

      if (0 == threadIdx.x) {
        agc.endCapZ[0] = ag.endCapZ[0] - bs->z;
        agc.endCapZ[1] = ag.endCapZ[1] - bs->z;
      }
    }

    // to be moved into a common namespace...
    using gpuClustering::invalidModuleId;
    constexpr int32_t MaxHitsInIter = pixelCPEforGPU::MaxHitsInIter;

    using ClusParams = pixelCPEforGPU::ClusParams;

    // as usual one block per module
    __shared__ ClusParams clusParams;
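    // clusParams lives in shared memory, so at most MaxHitsInIter clusters
    // can be processed at a time; modules with more clusters are handled in
    // batches by the loop over startClus below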

    auto me = clusters[blockIdx.x].moduleId();
    int nclus = clusters[me].clusInModule();

    if (0 == nclus)
      return;
#ifdef GPU_DEBUG
    if (threadIdx.x == 0) {
      auto k = clusters[1 + blockIdx.x].moduleStart();
      while (digis[k].moduleId() == invalidModuleId)
        ++k;
      assert(digis[k].moduleId() == me);
    }

    if (me % 100 == 1)
      if (threadIdx.x == 0)
        printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters[me].clusModuleStart());
#endif

    for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) {
      int nClusInIter = std::min(MaxHitsInIter, endClus - startClus);
      int lastClus = startClus + nClusInIter;
      assert(nClusInIter <= nclus);
      assert(nClusInIter > 0);
      assert(lastClus <= nclus);

      assert(nclus > MaxHitsInIter || (0 == startClus && nClusInIter == nclus && lastClus == nclus));

      // init
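      // minRow/minCol start at the largest representable value and
      // maxRow/maxCol at 0, so the atomicMin/atomicMax pass below can only
      // shrink/grow them toward the true cluster extent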
      for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) {
        clusParams.minRow[ic] = std::numeric_limits<uint32_t>::max();
        clusParams.maxRow[ic] = 0;
        clusParams.minCol[ic] = std::numeric_limits<uint32_t>::max();
        clusParams.maxCol[ic] = 0;
        clusParams.charge[ic] = 0;
        clusParams.q_f_X[ic] = 0;
        clusParams.q_l_X[ic] = 0;
        clusParams.q_f_Y[ic] = 0;
        clusParams.q_l_Y[ic] = 0;
      }

      __syncthreads();

      // one thread per "digi"
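      // first pass over this module's digis: find each cluster's bounding box
      // (min/max row and column) with atomic min/max updates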
      auto first = clusters[1 + blockIdx.x].moduleStart() + threadIdx.x;
      for (int i = first; i < numElements; i += blockDim.x) {
        auto id = digis[i].moduleId();
        if (id == invalidModuleId)
          continue;  // not valid
        if (id != me)
          break;  // end of module
        auto cl = digis[i].clus();
        if (cl < startClus || cl >= lastClus)
          continue;
        cl -= startClus;
        assert(cl >= 0);
        assert(cl < MaxHitsInIter);
        auto x = digis[i].xx();
        auto y = digis[i].yy();
        atomicMin(&clusParams.minRow[cl], x);
        atomicMax(&clusParams.maxRow[cl], x);
        atomicMin(&clusParams.minCol[cl], y);
        atomicMax(&clusParams.maxCol[cl], y);
      }

      __syncthreads();

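      // second pass: accumulate the total cluster charge and, truncated at
      // the per-module threshold pixmx, the charge collected on the first and
      // last row and column of each cluster (q_f_X, q_l_X, q_f_Y, q_l_Y),
      // which the CPE below uses for its position estimate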
      auto pixmx = cpeParams->detParams(me).pixmx;
      for (int i = first; i < numElements; i += blockDim.x) {
        auto id = digis[i].moduleId();
        if (id == invalidModuleId)
          continue;  // not valid
        if (id != me)
          break;  // end of module
        auto cl = digis[i].clus();
        if (cl < startClus || cl >= lastClus)
          continue;
        cl -= startClus;
        assert(cl >= 0);
        assert(cl < MaxHitsInIter);
        auto x = digis[i].xx();
        auto y = digis[i].yy();
        auto ch = digis[i].adc();
        atomicAdd(&clusParams.charge[cl], ch);
        ch = std::min(ch, pixmx);
        if (clusParams.minRow[cl] == x)
          atomicAdd(&clusParams.q_f_X[cl], ch);
        if (clusParams.maxRow[cl] == x)
          atomicAdd(&clusParams.q_l_X[cl], ch);
        if (clusParams.minCol[cl] == y)
          atomicAdd(&clusParams.q_f_Y[cl], ch);
        if (clusParams.maxCol[cl] == y)
          atomicAdd(&clusParams.q_l_Y[cl], ch);
      }

      __syncthreads();

      // next, one cluster per thread...

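      // third pass: each thread runs the CPE on one cluster, computing the
      // local position and errors, then fills the corresponding output hit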
      first = clusters[me].clusModuleStart() + startClus;
      for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) {
        auto h = first + ic;  // output index in global memory

        assert(h < hits.nHits());
        assert(h < clusters[me + 1].clusModuleStart());

        pixelCPEforGPU::position<TrackerTraits>(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);

        pixelCPEforGPU::errorFromDB<TrackerTraits>(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);

        // store it
        hits[h].chargeAndStatus().charge = clusParams.charge[ic];
        hits[h].chargeAndStatus().status = clusParams.status[ic];
        hits[h].detectorIndex() = me;

        float xl, yl;
        hits[h].xLocal() = xl = clusParams.xpos[ic];
        hits[h].yLocal() = yl = clusParams.ypos[ic];

        hits[h].clusterSizeX() = clusParams.xsize[ic];
        hits[h].clusterSizeY() = clusParams.ysize[ic];

        hits[h].xerrLocal() = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX;
        hits[h].yerrLocal() = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY;
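        // note: xerrLocal/yerrLocal hold squared errors (variances), with
        // apeXX/apeYY adding the alignment position error contribution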

        // keep it local for computations
        float xg, yg, zg;
        // to global and compute phi...
        cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg);
        // here correct for the beamspot...
        xg -= bs->x;
        yg -= bs->y;
        zg -= bs->z;

        hits[h].xGlobal() = xg;
        hits[h].yGlobal() = yg;
        hits[h].zGlobal() = zg;

        hits[h].rGlobal() = std::sqrt(xg * xg + yg * yg);
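        // iphi is a compressed integer representation of the azimuthal angle:
        // unsafe_atan2s (from approx_atan2.h) maps [-pi, pi) onto the short
        // int range; the template parameter selects the degree of the
        // polynomial approximation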
        hits[h].iphi() = unsafe_atan2s<7>(yg, xg);
      }
      __syncthreads();
    }  // end loop on batches
  }
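
  // A minimal launch sketch, illustrative only: the actual launch lives in
  // the calling code, and nModules, nThreads, and stream below are
  // hypothetical names (one block per module, as noted above):
  //   gpuPixelRecHits::getHits<pixelTopology::Phase1>
  //       <<<nModules, nThreads, 0, stream>>>(cpeParams, bs, digis, nDigis, clusters, hits);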

}  // namespace gpuPixelRecHits

#endif  // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h