Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-05-17 03:00:32

0001 #ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
0002 #define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
0003 
0004 #include <algorithm>
0005 #include <cmath>
0006 #include <cstdint>
0007 
0008 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
0009 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0010 
0011 #include "gpuVertexFinder.h"
0012 
0013 namespace gpuVertexFinder {
0014 
0015   // this algo does not really scale as it works in a single block...
0016   // enough for <10K tracks we have
0017   //
0018   // based on Rodrighez&Laio algo
0019   //
0020   __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
0021                                                          gpuVertexFinder::WorkSpace* pws,
0022                                                          int minT,      // min number of neighbours to be "seed"
0023                                                          float eps,     // max absolute distance to cluster
0024                                                          float errmax,  // max error to be "seed"
0025                                                          float chi2max  // max normalized distance to cluster
0026   ) {
0027     using namespace gpuVertexFinder;
0028     constexpr bool verbose = false;  // in principle the compiler should optmize out if false
0029 
0030     if (verbose && 0 == threadIdx.x)
0031       printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
0032 
0033     auto er2mx = errmax * errmax;
0034 
0035     auto& __restrict__ data = *pdata;
0036     auto& __restrict__ ws = *pws;
0037     auto nt = ws.ntrks;
0038     float const* __restrict__ zt = ws.zt;
0039     float const* __restrict__ ezt2 = ws.ezt2;
0040 
0041     uint32_t& nvFinal = data.nvFinal;
0042     uint32_t& nvIntermediate = ws.nvIntermediate;
0043 
0044     uint8_t* __restrict__ izt = ws.izt;
0045     int32_t* __restrict__ nn = data.ndof;
0046     int32_t* __restrict__ iv = ws.iv;
0047 
0048     assert(pdata);
0049     assert(zt);
0050 
0051     using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
0052     __shared__ Hist hist;
0053     __shared__ typename Hist::Counter hws[32];
0054     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
0055       hist.off[j] = 0;
0056     }
0057     __syncthreads();
0058 
0059     if (verbose && 0 == threadIdx.x)
0060       printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
0061 
0062     assert((int)nt <= hist.capacity());
0063 
0064     // fill hist  (bin shall be wider than "eps")
0065     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0066       assert(i < ZVertices::MAXTRACKS);
0067       int iz = int(zt[i] * 10.);  // valid if eps<=0.1
0068       // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry c++17 only
0069       iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
0070       izt[i] = iz - INT8_MIN;
0071       assert(iz - INT8_MIN >= 0);
0072       assert(iz - INT8_MIN < 256);
0073       hist.count(izt[i]);
0074       iv[i] = i;
0075       nn[i] = 0;
0076     }
0077     __syncthreads();
0078     if (threadIdx.x < 32)
0079       hws[threadIdx.x] = 0;  // used by prefix scan...
0080     __syncthreads();
0081     hist.finalize(hws);
0082     __syncthreads();
0083     assert(hist.size() == nt);
0084     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0085       hist.fill(izt[i], uint16_t(i));
0086     }
0087     __syncthreads();
0088 
0089     // count neighbours
0090     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0091       if (ezt2[i] > er2mx)
0092         continue;
0093       auto loop = [&](uint32_t j) {
0094         if (i == j)
0095           return;
0096         auto dist = std::abs(zt[i] - zt[j]);
0097         if (dist > eps)
0098           return;
0099         if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
0100           return;
0101         nn[i]++;
0102       };
0103 
0104       cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0105     }
0106 
0107     __syncthreads();
0108 
0109     // find closest above me .... (we ignore the possibility of two j at same distance from i)
0110     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0111       float mdist = eps;
0112       auto loop = [&](uint32_t j) {
0113         if (nn[j] < nn[i])
0114           return;
0115         if (nn[j] == nn[i] && zt[j] >= zt[i])
0116           return;  // if equal use natural order...
0117         auto dist = std::abs(zt[i] - zt[j]);
0118         if (dist > mdist)
0119           return;
0120         if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
0121           return;  // (break natural order???)
0122         mdist = dist;
0123         iv[i] = j;  // assign to cluster (better be unique??)
0124       };
0125       cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0126     }
0127 
0128     __syncthreads();
0129 
0130 #ifdef GPU_DEBUG
0131     //  mini verification
0132     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0133       if (iv[i] != int(i))
0134         assert(iv[iv[i]] != int(i));
0135     }
0136     __syncthreads();
0137 #endif
0138 
0139     // consolidate graph (percolate index of seed)
0140     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0141       auto m = iv[i];
0142       while (m != iv[m])
0143         m = iv[m];
0144       iv[i] = m;
0145     }
0146 
0147 #ifdef GPU_DEBUG
0148     __syncthreads();
0149     //  mini verification
0150     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0151       if (iv[i] != int(i))
0152         assert(iv[iv[i]] != int(i));
0153     }
0154 #endif
0155 
0156 #ifdef GPU_DEBUG
0157     // and verify that we did not spit any cluster...
0158     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0159       auto minJ = i;
0160       auto mdist = eps;
0161       auto loop = [&](uint32_t j) {
0162         if (nn[j] < nn[i])
0163           return;
0164         if (nn[j] == nn[i] && zt[j] >= zt[i])
0165           return;  // if equal use natural order...
0166         auto dist = std::abs(zt[i] - zt[j]);
0167         if (dist > mdist)
0168           return;
0169         if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
0170           return;
0171         mdist = dist;
0172         minJ = j;
0173       };
0174       cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0175       // should belong to the same cluster...
0176       assert(iv[i] == iv[minJ]);
0177       assert(nn[i] <= nn[iv[i]]);
0178     }
0179     __syncthreads();
0180 #endif
0181 
0182     __shared__ unsigned int foundClusters;
0183     foundClusters = 0;
0184     __syncthreads();
0185 
0186     // find the number of different clusters, identified by a tracks with clus[i] == i and density larger than threshold;
0187     // mark these tracks with a negative id.
0188     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0189       if (iv[i] == int(i)) {
0190         if (nn[i] >= minT) {
0191           auto old = atomicInc(&foundClusters, 0xffffffff);
0192           iv[i] = -(old + 1);
0193         } else {  // noise
0194           iv[i] = -9998;
0195         }
0196       }
0197     }
0198     __syncthreads();
0199 
0200     assert(foundClusters < ZVertices::MAXVTX);
0201 
0202     // propagate the negative id to all the tracks in the cluster.
0203     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0204       if (iv[i] >= 0) {
0205         // mark each track in a cluster with the same id as the first one
0206         iv[i] = iv[iv[i]];
0207       }
0208     }
0209     __syncthreads();
0210 
0211     // adjust the cluster id to be a positive value starting from 0
0212     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0213       iv[i] = -iv[i] - 1;
0214     }
0215 
0216     nvIntermediate = nvFinal = foundClusters;
0217 
0218     if (verbose && 0 == threadIdx.x)
0219       printf("found %d proto vertices\n", foundClusters);
0220   }
0221 
0222   __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata,
0223                                                gpuVertexFinder::WorkSpace* pws,
0224                                                int minT,      // min number of neighbours to be "seed"
0225                                                float eps,     // max absolute distance to cluster
0226                                                float errmax,  // max error to be "seed"
0227                                                float chi2max  // max normalized distance to cluster
0228   ) {
0229     clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
0230   }
0231 
0232 }  // namespace gpuVertexFinder
0233 
0234 #endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h