Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-05-17 03:00:32

0001 #ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
0002 #define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
0003 
0004 #include <algorithm>
0005 #include <cmath>
0006 #include <cstdint>
0007 
0008 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
0009 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0010 
0011 #include "gpuVertexFinder.h"
0012 
0013 namespace gpuVertexFinder {
0014 
0015   // this algo does not really scale as it works in a single block...
0016   // enough for <10K tracks we have
0017   __global__ void clusterTracksDBSCAN(ZVertices* pdata,
0018                                       WorkSpace* pws,
0019                                       int minT,      // min number of neighbours to be "core"
0020                                       float eps,     // max absolute distance to cluster
0021                                       float errmax,  // max error to be "seed"
0022                                       float chi2max  // max normalized distance to cluster
0023   ) {
0024     constexpr bool verbose = false;  // in principle the compiler should optmize out if false
0025 
0026     if (verbose && 0 == threadIdx.x)
0027       printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
0028 
0029     auto er2mx = errmax * errmax;
0030 
0031     auto& __restrict__ data = *pdata;
0032     auto& __restrict__ ws = *pws;
0033     auto nt = ws.ntrks;
0034     float const* __restrict__ zt = ws.zt;
0035     float const* __restrict__ ezt2 = ws.ezt2;
0036 
0037     uint32_t& nvFinal = data.nvFinal;
0038     uint32_t& nvIntermediate = ws.nvIntermediate;
0039 
0040     uint8_t* __restrict__ izt = ws.izt;
0041     int32_t* __restrict__ nn = data.ndof;
0042     int32_t* __restrict__ iv = ws.iv;
0043 
0044     assert(pdata);
0045     assert(zt);
0046 
0047     using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
0048     __shared__ Hist hist;
0049     __shared__ typename Hist::Counter hws[32];
0050     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
0051       hist.off[j] = 0;
0052     }
0053     __syncthreads();
0054 
0055     if (verbose && 0 == threadIdx.x)
0056       printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
0057 
0058     assert((int)nt <= hist.capacity());
0059 
0060     // fill hist  (bin shall be wider than "eps")
0061     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0062       assert(i < ZVertices::MAXTRACKS);
0063       int iz = int(zt[i] * 10.);  // valid if eps<=0.1
0064       iz = std::clamp(iz, INT8_MIN, INT8_MAX);
0065       izt[i] = iz - INT8_MIN;
0066       assert(iz - INT8_MIN >= 0);
0067       assert(iz - INT8_MIN < 256);
0068       hist.count(izt[i]);
0069       iv[i] = i;
0070       nn[i] = 0;
0071     }
0072     __syncthreads();
0073     if (threadIdx.x < 32)
0074       hws[threadIdx.x] = 0;  // used by prefix scan...
0075     __syncthreads();
0076     hist.finalize(hws);
0077     __syncthreads();
0078     assert(hist.size() == nt);
0079     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0080       hist.fill(izt[i], uint16_t(i));
0081     }
0082     __syncthreads();
0083 
0084     // count neighbours
0085     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0086       if (ezt2[i] > er2mx)
0087         continue;
0088       auto loop = [&](uint32_t j) {
0089         if (i == j)
0090           return;
0091         auto dist = std::abs(zt[i] - zt[j]);
0092         if (dist > eps)
0093           return;
0094         //        if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
0095         nn[i]++;
0096       };
0097 
0098       cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0099     }
0100 
0101     __syncthreads();
0102 
0103     // find NN with smaller z...
0104     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0105       if (nn[i] < minT)
0106         continue;  // DBSCAN core rule
0107       float mz = zt[i];
0108       auto loop = [&](uint32_t j) {
0109         if (zt[j] >= mz)
0110           return;
0111         if (nn[j] < minT)
0112           return;  // DBSCAN core rule
0113         auto dist = std::abs(zt[i] - zt[j]);
0114         if (dist > eps)
0115           return;
0116         //        if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
0117         mz = zt[j];
0118         iv[i] = j;  // assign to cluster (better be unique??)
0119       };
0120       cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0121     }
0122 
0123     __syncthreads();
0124 
0125 #ifdef GPU_DEBUG
0126     //  mini verification
0127     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0128       if (iv[i] != int(i))
0129         assert(iv[iv[i]] != int(i));
0130     }
0131     __syncthreads();
0132 #endif
0133 
0134     // consolidate graph (percolate index of seed)
0135     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0136       auto m = iv[i];
0137       while (m != iv[m])
0138         m = iv[m];
0139       iv[i] = m;
0140     }
0141 
0142     __syncthreads();
0143 
0144 #ifdef GPU_DEBUG
0145     //  mini verification
0146     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0147       if (iv[i] != int(i))
0148         assert(iv[iv[i]] != int(i));
0149     }
0150     __syncthreads();
0151 #endif
0152 
0153 #ifdef GPU_DEBUG
0154     // and verify that we did not spit any cluster...
0155     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0156       if (nn[i] < minT)
0157         continue;  // DBSCAN core rule
0158       assert(zt[iv[i]] <= zt[i]);
0159       auto loop = [&](uint32_t j) {
0160         if (nn[j] < minT)
0161           return;  // DBSCAN core rule
0162         auto dist = std::abs(zt[i] - zt[j]);
0163         if (dist > eps)
0164           return;
0165         //  if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
0166         // they should belong to the same cluster, isn't it?
0167         if (iv[i] != iv[j]) {
0168           printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]);
0169           printf("      %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]);
0170           ;
0171         }
0172         assert(iv[i] == iv[j]);
0173       };
0174       cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0175     }
0176     __syncthreads();
0177 #endif
0178 
0179     // collect edges (assign to closest cluster of closest point??? here to closest point)
0180     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0181       //    if (nn[i]==0 || nn[i]>=minT) continue;    // DBSCAN edge rule
0182       if (nn[i] >= minT)
0183         continue;  // DBSCAN edge rule
0184       float mdist = eps;
0185       auto loop = [&](uint32_t j) {
0186         if (nn[j] < minT)
0187           return;  // DBSCAN core rule
0188         auto dist = std::abs(zt[i] - zt[j]);
0189         if (dist > mdist)
0190           return;
0191         if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
0192           return;  // needed?
0193         mdist = dist;
0194         iv[i] = iv[j];  // assign to cluster (better be unique??)
0195       };
0196       cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0197     }
0198 
0199     __shared__ unsigned int foundClusters;
0200     foundClusters = 0;
0201     __syncthreads();
0202 
0203     // find the number of different clusters, identified by a tracks with clus[i] == i;
0204     // mark these tracks with a negative id.
0205     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0206       if (iv[i] == int(i)) {
0207         if (nn[i] >= minT) {
0208           auto old = atomicInc(&foundClusters, 0xffffffff);
0209           iv[i] = -(old + 1);
0210         } else {  // noise
0211           iv[i] = -9998;
0212         }
0213       }
0214     }
0215     __syncthreads();
0216 
0217     assert(foundClusters < ZVertices::MAXVTX);
0218 
0219     // propagate the negative id to all the tracks in the cluster.
0220     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0221       if (iv[i] >= 0) {
0222         // mark each track in a cluster with the same id as the first one
0223         iv[i] = iv[iv[i]];
0224       }
0225     }
0226     __syncthreads();
0227 
0228     // adjust the cluster id to be a positive value starting from 0
0229     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0230       iv[i] = -iv[i] - 1;
0231     }
0232 
0233     nvIntermediate = nvFinal = foundClusters;
0234 
0235     if (verbose && 0 == threadIdx.x)
0236       printf("found %d proto vertices\n", foundClusters);
0237   }
0238 
0239 }  // namespace gpuVertexFinder
0240 
0241 #endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h