// File indexing completed on 2024-09-07 04:38:05
0001 #ifndef RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
0002 #define RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
0003
0004 #include <algorithm>
0005 #include <cmath>
0006 #include <cstdint>
0007
0008 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
0009 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0010
0011 #include "gpuVertexFinder.h"
0012
0013 namespace gpuVertexFinder {
0014
0015
0016
// Cluster tracks along the beam line (z) into proto-vertices, iteratively
// merging compatible tracks until the assignment is stable.
//
// Only threadIdx.x / blockDim.x are used (blockIdx never appears), so the
// kernel assumes a single-block launch: all cooperation goes through
// block-shared memory and __syncthreads().
//
// pdata   - vertex SoA view; this kernel fills ndof() (used here as a
//           per-track neighbour counter) and nvFinal()
// pws     - workspace SoA view; inputs zt()/ezt2(), outputs izt(),
//           iv() (cluster index per track) and nvIntermediate()
// minT    - minimum number of neighbours for a track to act as cluster "core"
// eps     - maximum |zt[i] - zt[j]| for two tracks to count as neighbours
// errmax  - maximum z error for a track to take part in neighbour counting
// chi2max - maximum dz^2 / (ezt2[i] + ezt2[j]) for two tracks to be compatible
__global__ void clusterTracksIterative(VtxSoAView pdata,
                                       WsSoAView pws,
                                       int minT,
                                       float eps,
                                       float errmax,
                                       float chi2max) {
  constexpr bool verbose = false;  // compile-time switch for debug printouts

  if (verbose && 0 == threadIdx.x)
    printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

  auto er2mx = errmax * errmax;  // compare squared errors, no sqrt needed

  auto& __restrict__ data = pdata;
  auto& __restrict__ ws = pws;
  auto nt = ws.ntrks();                        // number of input tracks
  float const* __restrict__ zt = ws.zt();      // z position of each track
  float const* __restrict__ ezt2 = ws.ezt2();  // squared error on zt

  uint32_t& nvFinal = data.nvFinal();
  uint32_t& nvIntermediate = ws.nvIntermediate();

  uint8_t* __restrict__ izt = ws.izt();    // quantized z, used as histogram key
  int32_t* __restrict__ nn = data.ndof();  // neighbour count, stored in the ndof array
  int32_t* __restrict__ iv = ws.iv();      // cluster id / union-find parent per track

  assert(zt);
  assert(nn);
  assert(iv);
  assert(ezt2);

  // Block-shared histogram over the 256 possible quantized-z values; it
  // restricts all pairwise searches to tracks in nearby z bins.
  using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
  __shared__ Hist hist;
  __shared__ typename Hist::Counter hws[32];  // scratch used by hist.finalize()
  for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
    hist.off[j] = 0;
  }
  __syncthreads();

  if (verbose && 0 == threadIdx.x)
    printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

  assert((int)nt <= hist.capacity());

  // First pass: quantize z (0.1-unit bins via zt*10, clamped to the int8
  // range, shifted to be non-negative) and count histogram entries; also
  // reset the per-track clustering state.
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    assert(i < zVertex::utilities::MAXTRACKS);
    int iz = int(zt[i] * 10.);  // NOTE(review): `10.` is a double literal, promotes the product
    iz = std::clamp(iz, INT8_MIN, INT8_MAX);
    izt[i] = iz - INT8_MIN;
    assert(iz - INT8_MIN >= 0);
    assert(iz - INT8_MIN < 256);
    hist.count(izt[i]);
    iv[i] = i;  // each track starts as its own cluster
    nn[i] = 0;  // no neighbours counted yet
  }
  __syncthreads();
  if (threadIdx.x < 32)
    hws[threadIdx.x] = 0;  // zero the finalize() scratch
  __syncthreads();
  hist.finalize(hws);  // turn per-bin counts into offsets
  __syncthreads();
  assert(hist.size() == nt);
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    hist.fill(izt[i], uint16_t(i));
  }
  __syncthreads();

  // Count the neighbours of each track: tracks within eps in z and compatible
  // within chi2max. Tracks with too large an error (ezt2 > errmax^2) are
  // skipped, so their nn stays 0 and they can never become cluster cores.
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    if (ezt2[i] > er2mx)
      continue;
    auto loop = [&](uint32_t j) {
      if (i == j)
        return;
      auto dist = std::abs(zt[i] - zt[j]);
      if (dist > eps)
        return;
      if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
        return;
      nn[i]++;
    };

    // scan the track's own bin plus one bin on each side
    cms::cuda::forEachInBins(hist, izt[i], 1, loop);
  }

  __shared__ int nloops;
  nloops = 0;  // all threads store the same value; synced just below

  __syncthreads();

  // Iteratively merge compatible "core" tracks (nn >= minT) into common
  // clusters, union-find style. Passes alternate on nloops:
  //   odd pass : path compression — follow each iv chain to its root
  //   even pass: merging — pull compatible pairs toward the smaller cluster
  //              id via atomicMin, flagging `more` when anything changed
  // The loop ends once a merge pass changes nothing in any thread.
  bool more = true;
  while (__syncthreads_or(more)) {
    if (1 == nloops % 2) {
      // path compression: point every track directly at its chain's root
      for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
        auto m = iv[i];
        while (m != iv[m])
          m = iv[m];
        iv[i] = m;
      }
    } else {
      more = false;
      for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
        auto p = hist.begin() + k;
        auto i = (*p);
        auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
        if (nn[i] < minT)
          continue;  // only core tracks seed merges
        auto loop = [&](uint32_t j) {
          assert(i != j);
          if (nn[j] < minT)
            return;  // merge only core-core pairs
          auto dist = std::abs(zt[i] - zt[j]);
          if (dist > eps)
            return;
          if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
            return;
          auto old = atomicMin(&iv[j], iv[i]);
          if (old != iv[i]) {
            // iv[j] differed (or we lost a race): another iteration is needed
            more = true;
          }
          atomicMin(&iv[i], old);  // pull i down as well so ids keep converging
        };
        // start from the entry after p so each unordered pair is examined
        // only once, scanning forward up to the end of the next bin
        ++p;
        for (; p < hist.end(be); ++p)
          loop(*p);
      }
    }
    if (threadIdx.x == 0)
      ++nloops;
  }

  // Attach remaining non-core tracks (nn < minT) to the cluster of the
  // closest compatible core track within eps, if any.
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    if (nn[i] >= minT)
      continue;  // core tracks are already clustered
    float mdist = eps;  // running "closest so far" threshold
    auto loop = [&](int j) {
      if (nn[j] < minT)
        return;  // consider only core tracks as attachment points
      auto dist = std::abs(zt[i] - zt[j]);
      if (dist > mdist)
        return;
      if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
        return;
      mdist = dist;
      iv[i] = iv[j];
    };
    cms::cuda::forEachInBins(hist, izt[i], 1, loop);
  }

  __shared__ unsigned int foundClusters;
  foundClusters = 0;  // all threads store the same value; synced just below
  __syncthreads();

  // Number the clusters: each root (iv[i] == i) of a real cluster gets a
  // unique sequential id via atomicInc, temporarily stored as -(id + 1) so
  // negative ids cannot be confused with still-positive track indices.
  // Isolated non-core roots are tagged with the sentinel -9998.
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    if (iv[i] == int(i)) {
      if (nn[i] >= minT) {
        auto old = atomicInc(&foundClusters, 0xffffffff);
        iv[i] = -(old + 1);
      } else {
        iv[i] = -9998;  // noise / unassigned marker
      }
    }
  }
  __syncthreads();

  assert(foundClusters < zVertex::utilities::MAXVTX);

  // Propagate the (negative) cluster id from each root to its members.
  // NOTE(review): a single level of indirection is used, so iv chains are
  // assumed fully compressed at this point — confirm against the merge loop.
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    if (iv[i] >= 0) {
      // non-root member: copy the id already assigned to its root
      iv[i] = iv[iv[i]];
    }
  }
  __syncthreads();

  // Convert the temporary negative ids into final 0-based cluster indices:
  // -(id + 1) -> id; the -9998 sentinel maps to 9997.
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    iv[i] = -iv[i] - 1;
  }

  nvIntermediate = nvFinal = foundClusters;

  if (verbose && 0 == threadIdx.x)
    printf("found %d proto vertices\n", foundClusters);
}
0211
0212 }
0213
0214 #endif