#ifndef RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
#define RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h

#include <algorithm>
#include <cmath>
#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#include "gpuVertexFinder.h"

namespace gpuVertexFinder {

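  // Cluster tracks along the beam line with a DBSCAN-like algorithm:
  //  - tracks are binned in a shared-memory histogram of their quantized z position;
  //  - a track is a "core" track if at least minT other tracks lie within eps in z
  //    (tracks whose squared z error exceeds errmax^2 never count their neighbours,
  //    so they cannot become core tracks);
  //  - each core track is linked to the neighbouring core track with the smallest z,
  //    collapsing every density-connected region onto a single root;
  //  - the remaining tracks are attached to the closest core track within eps if the
  //    separation also satisfies dist^2 <= chi2max * (ezt2[i] + ezt2[j]).
  // On output iv[] holds the proto-vertex index of every track (unassigned tracks get
  // a large sentinel value) and nvFinal/nvIntermediate hold the number of proto-vertices.
  // The kernel strides over the tracks with threadIdx.x only and synchronizes the
  // phases with __syncthreads(), so it is meant to run as a single block.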
  __global__ void clusterTracksDBSCAN(VtxSoAView pdata,
                                      WsSoAView pws,
                                      int minT,
                                      float eps,
                                      float errmax,
                                      float chi2max) {
    constexpr bool verbose = false;

    if (verbose && 0 == threadIdx.x)
      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

    auto er2mx = errmax * errmax;

    auto& __restrict__ data = pdata;
    auto& __restrict__ ws = pws;
    auto nt = ws.ntrks();
    float const* __restrict__ zt = ws.zt();
    float const* __restrict__ ezt2 = ws.ezt2();

    uint32_t& nvFinal = data.nvFinal();
    uint32_t& nvIntermediate = ws.nvIntermediate();

    uint8_t* __restrict__ izt = ws.izt();
    int32_t* __restrict__ nn = data.ndof();
    int32_t* __restrict__ iv = ws.iv();

    assert(zt);
    assert(iv);
    assert(nn);
    assert(ezt2);

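    // Shared-memory histogram of the quantized track z positions: 256 bins with room
    // for up to 16000 track indices (uint16_t payload); hws is scratch space used by
    // hist.finalize() to turn the per-bin counts into cumulative offsets.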
    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
    __shared__ Hist hist;
    __shared__ typename Hist::Counter hws[32];
    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
      hist.off[j] = 0;
    }
    __syncthreads();

    if (verbose && 0 == threadIdx.x)
      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

    assert((int)nt <= hist.capacity());

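    // Quantize each track z into an 8-bit bin index izt[i] (zt * 10 clamped to the
    // int8 range; assuming z is in cm this gives 1 mm wide bins covering roughly
    // |z| < 12.8 cm), count the bin occupancies, and start every track as its own
    // cluster candidate (iv[i] = i) with zero neighbours (nn[i] = 0); then finalize
    // the histogram (prefix sum of the counts) and fill it with the track indices.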
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      assert(i < zVertex::utilities::MAXTRACKS);
      int iz = int(zt[i] * 10.);
      iz = std::clamp(iz, INT8_MIN, INT8_MAX);
      izt[i] = iz - INT8_MIN;
      assert(iz - INT8_MIN >= 0);
      assert(iz - INT8_MIN < 256);
      hist.count(izt[i]);
      iv[i] = i;
      nn[i] = 0;
    }
    __syncthreads();
    if (threadIdx.x < 32)
      hws[threadIdx.x] = 0;
    __syncthreads();
    hist.finalize(hws);
    __syncthreads();
    assert(hist.size() == nt);
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      hist.fill(izt[i], uint16_t(i));
    }
    __syncthreads();

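    // Count, for every track whose z error passes the errmax cut, the number of other
    // tracks closer than eps in z, scanning only the track's own histogram bin and
    // the two neighbouring bins.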
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (ezt2[i] > er2mx)
        continue;
      auto loop = [&](uint32_t j) {
        if (i == j)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;

        nn[i]++;
      };

      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __syncthreads();

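    // For every core track (nn[i] >= minT) point iv[i] to the neighbouring core track
    // with the smallest z below zt[i] (within eps, using the same three-bin search
    // window). The links always walk towards smaller z, so each density-connected
    // region collapses onto a single root track.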
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (nn[i] < minT)
        continue;
      float mz = zt[i];
      auto loop = [&](uint32_t j) {
        if (zt[j] >= mz)
          return;
        if (nn[j] < minT)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;

        mz = zt[j];
        iv[i] = j;
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __syncthreads();

#ifdef GPU_DEBUG
    // verify that the links contain no two-track cycles
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] != int(i))
        assert(iv[iv[i]] != int(i));
    }
    __syncthreads();
#endif

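    // Consolidate the chains: follow the links up to the self-referencing root and
    // point every track directly at it (path compression).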
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      auto m = iv[i];
      while (m != iv[m])
        m = iv[m];
      iv[i] = m;
    }

    __syncthreads();

#ifdef GPU_DEBUG
    // the compressed links must still contain no two-track cycles
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] != int(i))
        assert(iv[iv[i]] != int(i));
    }
    __syncthreads();
#endif

#ifdef GPU_DEBUG
    // core tracks closer than eps must end up in the same cluster,
    // and a root must not sit at larger z than the tracks pointing to it
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (nn[i] < minT)
        continue;
      assert(zt[iv[i]] <= zt[i]);
      auto loop = [&](uint32_t j) {
        if (nn[j] < minT)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;

        if (iv[i] != iv[j]) {
          printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]);
          printf("      %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]);
        }
        assert(iv[i] == iv[j]);
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
    __syncthreads();
#endif

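    // Attach each remaining non-core track to the cluster of the closest core track
    // within eps, provided the separation is also compatible with the two z errors:
    // dist^2 <= chi2max * (ezt2[i] + ezt2[j]).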
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (nn[i] >= minT)
        continue;
      float mdist = eps;
      auto loop = [&](uint32_t j) {
        if (nn[j] < minT)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        mdist = dist;
        iv[i] = iv[j];
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

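    // Count the clusters: every core root receives a new cluster number, stored
    // temporarily as a negative value (-(id + 1)); roots that are not core tracks
    // are flagged as noise with the sentinel -9998.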
    __shared__ unsigned int foundClusters;
    foundClusters = 0;
    __syncthreads();

    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] == int(i)) {
        if (nn[i] >= minT) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          iv[i] = -(old + 1);
        } else {
          iv[i] = -9998;
        }
      }
    }
    __syncthreads();

    assert(foundClusters < zVertex::utilities::MAXVTX);

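    // Propagate the (still negative) cluster number from each root to the tracks
    // that point to it.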
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] >= 0) {
        iv[i] = iv[iv[i]];
      }
    }
    __syncthreads();

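    // Flip the encoding back to a plain cluster index: -(id + 1) becomes id, so
    // clustered tracks end up in [0, foundClusters) while noise tracks (-9998)
    // end up at 9997, above any valid vertex index.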
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      iv[i] = -iv[i] - 1;
    }

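    // Publish the number of proto-vertices to both the output SoA and the workspace.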
    nvIntermediate = nvFinal = foundClusters;

    if (verbose && 0 == threadIdx.x)
      printf("found %d proto vertices\n", foundClusters);
  }

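  // Minimal usage sketch (illustrative only: the actual launch configuration and the
  // code that fills the workspace live elsewhere in the vertex finder, and the block
  // size below is an assumption). Since the kernel indexes tracks with threadIdx.x
  // only and synchronizes with __syncthreads(), it has to be launched as one block:
  //
  //   clusterTracksDBSCAN<<<1, 1024, 0, stream>>>(vertexView, wsView,
  //                                               minT, eps, errmax, chi2max);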
}  // namespace gpuVertexFinder

#endif  // RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h