#ifndef RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
#define RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h

#include <algorithm>
#include <cmath>
#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#include "gpuVertexFinder.h"

namespace gpuVertexFinder {

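  // Clusters tracks along z with a density-based approach: each track counts its
  // neighbours within eps, is linked to its nearest neighbour of equal or higher
  // density, and density peaks with at least minT neighbours become vertex seeds.
  // All tracks are processed by a single block (block-stride loops, block-shared
  // histogram), so the algorithm does not scale beyond one block.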
  __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView& pdata,
                                                         WsSoAView& pws,
                                                         int minT,      // min number of neighbours to be a "seed"
                                                         float eps,     // max absolute distance to cluster
                                                         float errmax,  // max track z error to qualify as a "seed"
                                                         float chi2max  // max normalized distance to cluster
  ) {
    using namespace gpuVertexFinder;
    constexpr bool verbose = false;

    if (verbose && 0 == threadIdx.x)
      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

    auto er2mx = errmax * errmax;

    auto& __restrict__ data = pdata;
    auto& __restrict__ ws = pws;
    auto nt = ws.ntrks();
    float const* __restrict__ zt = ws.zt();
    float const* __restrict__ ezt2 = ws.ezt2();

    uint32_t& nvFinal = data.nvFinal();
    uint32_t& nvIntermediate = ws.nvIntermediate();

    uint8_t* __restrict__ izt = ws.izt();
    int32_t* __restrict__ nn = data.ndof();
    int32_t* __restrict__ iv = ws.iv();

    assert(zt);
    assert(ezt2);
    assert(izt);
    assert(nn);
    assert(iv);

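    // bin the tracks in z with a block-shared histogram, so that neighbour
    // searches only need to visit the bin of a track and the adjacent ones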
    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
    __shared__ Hist hist;
    __shared__ typename Hist::Counter hws[32];
    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
      hist.off[j] = 0;
    }
    __syncthreads();

    if (verbose && 0 == threadIdx.x)
      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

    assert((int)nt <= hist.capacity());

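    // fill the histogram: the bin width (0.1 in z) must be wider than eps,
    // so that all neighbour candidates are found within +/- one bin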
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      assert(i < zVertex::utilities::MAXTRACKS);
      int iz = int(zt[i] * 10.);  // bin width 0.1 in z
      iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
      izt[i] = iz - INT8_MIN;  // shift to [0, 255] to fit the uint8_t bin index
      assert(iz - INT8_MIN >= 0);
      assert(iz - INT8_MIN < 256);
      hist.count(izt[i]);
      iv[i] = i;   // initially each track is its own cluster
      nn[i] = 0;   // neighbour count
    }
    __syncthreads();
    if (threadIdx.x < 32)
      hws[threadIdx.x] = 0;  // scratch counters used by the histogram prefix scan
    __syncthreads();
    hist.finalize(hws);
    __syncthreads();
    assert(hist.size() == nt);
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      hist.fill(izt[i], uint16_t(i));
    }
    __syncthreads();

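    // count, for each track with small enough error, the neighbours that are
    // within eps in z and compatible within chi2max given the two uncertainties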
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (ezt2[i] > er2mx)
        continue;
      auto loop = [&](uint32_t j) {
        if (i == j)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        nn[i]++;
      };

      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __syncthreads();

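    // for each track, find the closest compatible track of equal or higher
    // density (neighbour count) and link to it; density peaks remain linked to themselves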
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      float mdist = eps;
      auto loop = [&](uint32_t j) {
        if (nn[j] < nn[i])
          return;
        if (nn[j] == nn[i] && zt[j] >= zt[i])
          return;  // if the density is equal, break the tie using the natural z order
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        mdist = dist;
        iv[i] = j;  // link i to its nearest higher-density neighbour
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __syncthreads();

#ifdef GPU_DEBUG
    // mini verification: the nearest-neighbour links must not form two-cycles
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] != int(i))
        assert(iv[iv[i]] != int(i));
    }
    __syncthreads();
#endif

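    // consolidate the graph: follow the links until the density peak (the cluster
    // seed, which points to itself) is reached, and attach every track directly to it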
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      auto m = iv[i];
      while (m != iv[m])
        m = iv[m];
      iv[i] = m;
    }

#ifdef GPU_DEBUG
    __syncthreads();
    // mini verification: repeated after the consolidation step
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] != int(i))
        assert(iv[iv[i]] != int(i));
    }
#endif

#ifdef GPU_DEBUG
    // verify that no cluster has been split: redoing the nearest-higher-density
    // search must land in the same cluster, and the seed of a cluster must have
    // at least as many neighbours as any track assigned to it
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      auto minJ = i;
      auto mdist = eps;
      auto loop = [&](uint32_t j) {
        if (nn[j] < nn[i])
          return;
        if (nn[j] == nn[i] && zt[j] >= zt[i])
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        mdist = dist;
        minJ = j;
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
      assert(iv[i] == iv[minJ]);
      assert(nn[i] <= nn[iv[i]]);
    }
    __syncthreads();
#endif

    __shared__ unsigned int foundClusters;
    foundClusters = 0;
    __syncthreads();

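    // identify the clusters: a track that is its own peak (iv[i] == i) becomes a
    // vertex seed if it has at least minT neighbours and gets a unique negative id;
    // otherwise it is flagged as noise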
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] == int(i)) {
        if (nn[i] >= minT) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          iv[i] = -(old + 1);
        } else {
          iv[i] = -9998;  // noise
        }
      }
    }
    __syncthreads();

    assert(foundClusters < zVertex::utilities::MAXVTX);

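    // propagate the negative id of each seed to all the tracks attached to it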
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] >= 0) {
        // not a seed: copy the cluster id from the seed this track points to
        iv[i] = iv[iv[i]];
      }
    }
    __syncthreads();

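    // turn the ids back into non-negative cluster indices starting from 0
    // (noise tracks, marked -9998, end up with index 9997)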
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      iv[i] = -iv[i] - 1;
    }

    nvIntermediate = nvFinal = foundClusters;

    if (verbose && 0 == threadIdx.x)
      printf("found %d proto vertices\n", foundClusters);
  }

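  // Thin kernel wrapper around the device function above. The clustering relies
  // only on threadIdx/blockDim strides and block-shared state, so the kernel must
  // be launched with a single block to process all tracks consistently.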
  __global__ void clusterTracksByDensityKernel(VtxSoAView pdata,
                                               WsSoAView pws,
                                               int minT,      // min number of neighbours to be a "seed"
                                               float eps,     // max absolute distance to cluster
                                               float errmax,  // max track z error to qualify as a "seed"
                                               float chi2max  // max normalized distance to cluster
  ) {
    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
  }

}  // namespace gpuVertexFinder

#endif  // RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h