#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h

#include <algorithm>
#include <cmath>
#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#include "gpuVertexFinder.h"

namespace gpuVertexFinder {

  // this algorithm does not really scale, as it works in a single block...
  // but that is enough for the <10k tracks we have
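  //
  // Overall flow: bin the tracks in z, count each track's neighbours,
  // iteratively merge mutually compatible "core" tracks into proto-clusters,
  // and finally attach the remaining edge tracks to the closest core track.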
  __global__ void clusterTracksIterative(ZVertices* pdata,
                                         WorkSpace* pws,
                                         int minT,      // min number of neighbours to be "core"
                                         float eps,     // max absolute distance to cluster
                                         float errmax,  // max error to be "seed"
                                         float chi2max  // max normalized distance to cluster
  ) {
    constexpr bool verbose = false;  // in principle the compiler should optimize this out if false

    if (verbose && 0 == threadIdx.x)
      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

    auto er2mx = errmax * errmax;

    auto& __restrict__ data = *pdata;
    auto& __restrict__ ws = *pws;
    auto nt = ws.ntrks;
    float const* __restrict__ zt = ws.zt;      // input track z positions
    float const* __restrict__ ezt2 = ws.ezt2;  // squared errors on zt

    uint32_t& nvFinal = data.nvFinal;
    uint32_t& nvIntermediate = ws.nvIntermediate;

    uint8_t* __restrict__ izt = ws.izt;    // z bin index of each track
    int32_t* __restrict__ nn = data.ndof;  // number of neighbours (reuses the ndof storage)
    int32_t* __restrict__ iv = ws.iv;      // cluster id of each track

    assert(pdata);
    assert(zt);
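
    // Bin the tracks in z with a shared-memory histogram, so that the
    // pair-wise searches below only need to visit tracks in nearby bins.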
    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
    __shared__ Hist hist;
    __shared__ typename Hist::Counter hws[32];
    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
      hist.off[j] = 0;
    }
    __syncthreads();

    if (verbose && 0 == threadIdx.x)
      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

    assert((int)nt <= hist.capacity());
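
    // The histogram is filled in two passes: count the entries per bin, then,
    // after a prefix scan of the counts, store each track index in its bin.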
    // fill the histogram (the bin width must be larger than "eps")
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      assert(i < ZVertices::MAXTRACKS);
      int iz = int(zt[i] * 10.);  // valid if eps <= 0.1
      iz = std::clamp(iz, INT8_MIN, INT8_MAX);
      izt[i] = iz - INT8_MIN;  // shift into [0, 255] to index the histogram
      assert(iz - INT8_MIN >= 0);
      assert(iz - INT8_MIN < 256);
      hist.count(izt[i]);
      iv[i] = i;  // initially every track is its own cluster
      nn[i] = 0;  // ... with no neighbours
    }
    __syncthreads();
    if (threadIdx.x < 32)
      hws[threadIdx.x] = 0;  // used by the prefix scan...
    __syncthreads();
    hist.finalize(hws);
    __syncthreads();
    assert(hist.size() == nt);
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      hist.fill(izt[i], uint16_t(i));
    }
    __syncthreads();
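
    // Two tracks count as neighbours when they are close both in absolute
    // distance, |dz| <= eps, and relative to their combined uncertainty,
    // dz^2 <= chi2max * (ezt2[i] + ezt2[j]).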
    // count neighbours
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (ezt2[i] > er2mx)
        continue;  // tracks with too large an error are never seeds
      auto loop = [&](uint32_t j) {
        if (i == j)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        nn[i]++;
      };

      // visit the tracks in the bin of izt[i] and in the two neighbouring bins
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __shared__ int nloops;
    nloops = 0;

    __syncthreads();

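    // Iterate until no cluster assignment changes: even iterations merge the
    // clusters of mutually compatible core tracks via atomicMin on their ids,
    // odd iterations flatten the resulting id chains (as in union-find path
    // compression); __syncthreads_or keeps the whole block looping until no
    // thread has any change left to apply.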
    // cluster seeds only
    bool more = true;
    while (__syncthreads_or(more)) {
      if (1 == nloops % 2) {
        // odd iterations: flatten the cluster-id chains ("path compression")
        for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
          auto m = iv[i];
          while (m != iv[m])
            m = iv[m];
          iv[i] = m;
        }
      } else {
        // even iterations: merge the clusters of mutually compatible core tracks
        more = false;
        for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
          auto p = hist.begin() + k;
          auto i = (*p);
          auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
          if (nn[i] < minT)
            continue;  // DBSCAN core rule
          auto loop = [&](uint32_t j) {
            assert(i != j);
            if (nn[j] < minT)
              return;  // DBSCAN core rule
            auto dist = std::abs(zt[i] - zt[j]);
            if (dist > eps)
              return;
            if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
              return;
            auto old = atomicMin(&iv[j], iv[i]);
            if (old != iv[i]) {
              // a cluster assignment changed: keep iterating
              more = true;
            }
            atomicMin(&iv[i], old);
          };
          ++p;
          for (; p < hist.end(be); ++p)
            loop(*p);
        }  // for i
      }
      if (threadIdx.x == 0)
        ++nloops;
    }  // while

    // collect the edges (assign each edge to the cluster of its closest core
    // point; one could instead assign it to the closest cluster)
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      //    if (nn[i]==0 || nn[i]>=minT) continue;    // DBSCAN edge rule
      if (nn[i] >= minT)
        continue;  // DBSCAN edge rule
      float mdist = eps;
      auto loop = [&](int j) {
        if (nn[j] < minT)
          return;  // DBSCAN core rule
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;  // needed?
        mdist = dist;
        iv[i] = iv[j];  // assign to the cluster of the closest core point (not guaranteed unique)
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __shared__ unsigned int foundClusters;
    foundClusters = 0;
    __syncthreads();

    // find the number of different clusters, identified by the tracks with iv[i] == i;
    // mark these tracks with a negative id.
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] == int(i)) {
        if (nn[i] >= minT) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          iv[i] = -(old + 1);
        } else {  // noise
          iv[i] = -9998;
        }
      }
    }
    __syncthreads();

    assert(foundClusters < ZVertices::MAXVTX);

    // propagate the negative id to all the tracks in the cluster.
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] >= 0) {
        // mark each track in a cluster with the same id as the first one
        iv[i] = iv[iv[i]];
      }
    }
    __syncthreads();

    // adjust the cluster id to be a positive value starting from 0
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      iv[i] = -iv[i] - 1;
    }

    nvIntermediate = nvFinal = foundClusters;

    if (verbose && 0 == threadIdx.x)
      printf("found %d proto vertices\n", foundClusters);
  }
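
  // A minimal host-side launch sketch (hypothetical names: onGPU_d and ws_d
  // are device pointers to ZVertices and WorkSpace, stream is a cudaStream_t;
  // a single block matches the single-block design noted above):
  //
  //   clusterTracksIterative<<<1, 1024, 0, stream>>>(onGPU_d, ws_d, minT, eps, errmax, chi2max);
  //   cudaCheck(cudaGetLastError());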

}  // namespace gpuVertexFinder

#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h