Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2022-03-29 07:47:34

0001 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0002 
0003 #include "gpuClusterTracksByDensity.h"
0004 #include "gpuClusterTracksDBSCAN.h"
0005 #include "gpuClusterTracksIterative.h"
0006 #include "gpuFitVertices.h"
0007 #include "gpuSortByPt2.h"
0008 #include "gpuSplitVertices.h"
0009 
0010 #undef PIXVERTEX_DEBUG_PRODUCE
0011 
0012 namespace gpuVertexFinder {
0013 
0014   // reject outlier tracks that contribute more than this to the chi2 of the vertex fit
0015   constexpr float maxChi2ForFirstFit = 50.f;
0016   constexpr float maxChi2ForFinalFit = 5000.f;
0017 
0018   // split vertices with a chi2/NDoF greater than this
0019   constexpr float maxChi2ForSplit = 9.f;
0020 
0021   __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) {
0022     assert(ptracks);
0023     assert(soa);
0024     auto const& tracks = *ptracks;
0025     auto const& fit = tracks.stateAtBS;
0026     auto const* quality = tracks.qualityData();
0027 
0028     auto first = blockIdx.x * blockDim.x + threadIdx.x;
0029     for (int idx = first, nt = tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) {
0030       auto nHits = tracks.nHits(idx);
0031       assert(nHits >= 3);
0032 
0033       // initialize soa...
0034       soa->idv[idx] = -1;
0035 
0036       if (tracks.isTriplet(idx))
0037         continue;  // no triplets
0038       if (quality[idx] < pixelTrack::Quality::highPurity)
0039         continue;
0040 
0041       auto pt = tracks.pt(idx);
0042 
0043       if (pt < ptMin)
0044         continue;
0045 
0046       // clamp pt
0047       pt = std::min(pt, ptMax);
0048 
0049       auto& data = *pws;
0050       auto it = atomicAdd(&data.ntrks, 1);
0051       data.itrk[it] = idx;
0052       data.zt[it] = tracks.zip(idx);
0053       data.ezt2[it] = fit.covariance(idx)(14);
0054       data.ptt2[it] = pt * pt;
0055     }
0056   }
0057 
0058 // #define THREE_KERNELS
0059 #ifndef THREE_KERNELS
0060   __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
0061                                         gpuVertexFinder::WorkSpace* pws,
0062                                         int minT,      // min number of neighbours to be "seed"
0063                                         float eps,     // max absolute distance to cluster
0064                                         float errmax,  // max error to be "seed"
0065                                         float chi2max  // max normalized distance to cluster,
0066   ) {
0067     clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
0068     __syncthreads();
0069     fitVertices(pdata, pws, maxChi2ForFirstFit);
0070     __syncthreads();
0071     splitVertices(pdata, pws, maxChi2ForSplit);
0072     __syncthreads();
0073     fitVertices(pdata, pws, maxChi2ForFinalFit);
0074     __syncthreads();
0075     sortByPt2(pdata, pws);
0076   }
0077 #else
0078   __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
0079                                       gpuVertexFinder::WorkSpace* pws,
0080                                       int minT,      // min number of neighbours to be "seed"
0081                                       float eps,     // max absolute distance to cluster
0082                                       float errmax,  // max error to be "seed"
0083                                       float chi2max  // max normalized distance to cluster,
0084   ) {
0085     clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
0086     __syncthreads();
0087     fitVertices(pdata, pws, maxChi2ForFirstFit);
0088   }
0089 
0090   __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {
0091     fitVertices(pdata, pws, maxChi2ForFinalFit);
0092     __syncthreads();
0093     sortByPt2(pdata, pws);
0094   }
0095 #endif
0096 
0097 #ifdef __CUDACC__
0098   ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const {
0099 #ifdef PIXVERTEX_DEBUG_PRODUCE
0100     std::cout << "producing Vertices on GPU" << std::endl;
0101 #endif  // PIXVERTEX_DEBUG_PRODUCE
0102     ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
0103 #else
0104   ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin, float ptMax) const {
0105 #ifdef PIXVERTEX_DEBUG_PRODUCE
0106     std::cout << "producing Vertices on  CPU" << std::endl;
0107 #endif  // PIXVERTEX_DEBUG_PRODUCE
0108     ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
0109 #endif
0110     assert(tksoa);
0111     auto* soa = vertices.get();
0112     assert(soa);
0113 
0114 #ifdef __CUDACC__
0115     auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
0116 #else
0117     auto ws_d = std::make_unique<WorkSpace>();
0118 #endif
0119 
0120 #ifdef __CUDACC__
0121     init<<<1, 1, 0, stream>>>(soa, ws_d.get());
0122     auto blockSize = 128;
0123     auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
0124     loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin, ptMax);
0125     cudaCheck(cudaGetLastError());
0126 #else
0127     init(soa, ws_d.get());
0128     loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax);
0129 #endif
0130 
0131 #ifdef __CUDACC__
0132     // Running too many thread lead to problems when printf is enabled.
0133     constexpr int maxThreadsForPrint = 1024 - 128;
0134     constexpr int numBlocks = 1024;
0135     constexpr int threadsPerBlock = 128;
0136 
0137     if (oneKernel_) {
0138       // implemented only for density clustesrs
0139 #ifndef THREE_KERNELS
0140       vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
0141 #else
0142       vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
0143       cudaCheck(cudaGetLastError());
0144       // one block per vertex...
0145       splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
0146       cudaCheck(cudaGetLastError());
0147       vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
0148 #endif
0149     } else {  // five kernels
0150       if (useDensity_) {
0151         clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
0152       } else if (useDBSCAN_) {
0153         clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
0154       } else if (useIterative_) {
0155         clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
0156       }
0157       cudaCheck(cudaGetLastError());
0158       fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
0159       cudaCheck(cudaGetLastError());
0160       // one block per vertex...
0161       splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
0162       cudaCheck(cudaGetLastError());
0163       fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
0164       cudaCheck(cudaGetLastError());
0165       sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
0166     }
0167     cudaCheck(cudaGetLastError());
0168 #else  // __CUDACC__
0169     if (useDensity_) {
0170       clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
0171     } else if (useDBSCAN_) {
0172       clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
0173     } else if (useIterative_) {
0174       clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
0175     }
0176 #ifdef PIXVERTEX_DEBUG_PRODUCE
0177     std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
0178 #endif  // PIXVERTEX_DEBUG_PRODUCE
0179     fitVertices(soa, ws_d.get(), maxChi2ForFirstFit);
0180     // one block per vertex!
0181     splitVertices(soa, ws_d.get(), maxChi2ForSplit);
0182     fitVertices(soa, ws_d.get(), maxChi2ForFinalFit);
0183     sortByPt2(soa, ws_d.get());
0184 #endif
0185 
0186     return vertices;
0187   }
0188 
0189 }  // namespace gpuVertexFinder