File indexing completed on 2021-05-17 03:00:32
0001 #ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
0002 #define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
0003
0004 #include <algorithm>
0005 #include <cmath>
0006 #include <cstdint>
0007
0008 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
0009 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0010
0011 #include "gpuVertexFinder.h"
0012
0013 namespace gpuVertexFinder {
0014
0015
0016
0017 __global__ void clusterTracksDBSCAN(ZVertices* pdata,
0018 WorkSpace* pws,
0019 int minT,
0020 float eps,
0021 float errmax,
0022 float chi2max
0023 ) {
0024 constexpr bool verbose = false;
0025
0026 if (verbose && 0 == threadIdx.x)
0027 printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
0028
0029 auto er2mx = errmax * errmax;
0030
0031 auto& __restrict__ data = *pdata;
0032 auto& __restrict__ ws = *pws;
0033 auto nt = ws.ntrks;
0034 float const* __restrict__ zt = ws.zt;
0035 float const* __restrict__ ezt2 = ws.ezt2;
0036
0037 uint32_t& nvFinal = data.nvFinal;
0038 uint32_t& nvIntermediate = ws.nvIntermediate;
0039
0040 uint8_t* __restrict__ izt = ws.izt;
0041 int32_t* __restrict__ nn = data.ndof;
0042 int32_t* __restrict__ iv = ws.iv;
0043
0044 assert(pdata);
0045 assert(zt);
0046
0047 using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
0048 __shared__ Hist hist;
0049 __shared__ typename Hist::Counter hws[32];
0050 for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
0051 hist.off[j] = 0;
0052 }
0053 __syncthreads();
0054
0055 if (verbose && 0 == threadIdx.x)
0056 printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
0057
0058 assert((int)nt <= hist.capacity());
0059
0060
0061 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0062 assert(i < ZVertices::MAXTRACKS);
0063 int iz = int(zt[i] * 10.);
0064 iz = std::clamp(iz, INT8_MIN, INT8_MAX);
0065 izt[i] = iz - INT8_MIN;
0066 assert(iz - INT8_MIN >= 0);
0067 assert(iz - INT8_MIN < 256);
0068 hist.count(izt[i]);
0069 iv[i] = i;
0070 nn[i] = 0;
0071 }
0072 __syncthreads();
0073 if (threadIdx.x < 32)
0074 hws[threadIdx.x] = 0;
0075 __syncthreads();
0076 hist.finalize(hws);
0077 __syncthreads();
0078 assert(hist.size() == nt);
0079 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0080 hist.fill(izt[i], uint16_t(i));
0081 }
0082 __syncthreads();
0083
0084
0085 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0086 if (ezt2[i] > er2mx)
0087 continue;
0088 auto loop = [&](uint32_t j) {
0089 if (i == j)
0090 return;
0091 auto dist = std::abs(zt[i] - zt[j]);
0092 if (dist > eps)
0093 return;
0094
0095 nn[i]++;
0096 };
0097
0098 cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0099 }
0100
0101 __syncthreads();
0102
0103
0104 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0105 if (nn[i] < minT)
0106 continue;
0107 float mz = zt[i];
0108 auto loop = [&](uint32_t j) {
0109 if (zt[j] >= mz)
0110 return;
0111 if (nn[j] < minT)
0112 return;
0113 auto dist = std::abs(zt[i] - zt[j]);
0114 if (dist > eps)
0115 return;
0116
0117 mz = zt[j];
0118 iv[i] = j;
0119 };
0120 cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0121 }
0122
0123 __syncthreads();
0124
0125 #ifdef GPU_DEBUG
0126
0127 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0128 if (iv[i] != int(i))
0129 assert(iv[iv[i]] != int(i));
0130 }
0131 __syncthreads();
0132 #endif
0133
0134
0135 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0136 auto m = iv[i];
0137 while (m != iv[m])
0138 m = iv[m];
0139 iv[i] = m;
0140 }
0141
0142 __syncthreads();
0143
0144 #ifdef GPU_DEBUG
0145
0146 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0147 if (iv[i] != int(i))
0148 assert(iv[iv[i]] != int(i));
0149 }
0150 __syncthreads();
0151 #endif
0152
0153 #ifdef GPU_DEBUG
0154
0155 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0156 if (nn[i] < minT)
0157 continue;
0158 assert(zt[iv[i]] <= zt[i]);
0159 auto loop = [&](uint32_t j) {
0160 if (nn[j] < minT)
0161 return;
0162 auto dist = std::abs(zt[i] - zt[j]);
0163 if (dist > eps)
0164 return;
0165
0166
0167 if (iv[i] != iv[j]) {
0168 printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]);
0169 printf(" %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]);
0170 ;
0171 }
0172 assert(iv[i] == iv[j]);
0173 };
0174 cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0175 }
0176 __syncthreads();
0177 #endif
0178
0179
0180 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0181
0182 if (nn[i] >= minT)
0183 continue;
0184 float mdist = eps;
0185 auto loop = [&](uint32_t j) {
0186 if (nn[j] < minT)
0187 return;
0188 auto dist = std::abs(zt[i] - zt[j]);
0189 if (dist > mdist)
0190 return;
0191 if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
0192 return;
0193 mdist = dist;
0194 iv[i] = iv[j];
0195 };
0196 cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0197 }
0198
0199 __shared__ unsigned int foundClusters;
0200 foundClusters = 0;
0201 __syncthreads();
0202
0203
0204
0205 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0206 if (iv[i] == int(i)) {
0207 if (nn[i] >= minT) {
0208 auto old = atomicInc(&foundClusters, 0xffffffff);
0209 iv[i] = -(old + 1);
0210 } else {
0211 iv[i] = -9998;
0212 }
0213 }
0214 }
0215 __syncthreads();
0216
0217 assert(foundClusters < ZVertices::MAXVTX);
0218
0219
0220 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0221 if (iv[i] >= 0) {
0222
0223 iv[i] = iv[iv[i]];
0224 }
0225 }
0226 __syncthreads();
0227
0228
0229 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0230 iv[i] = -iv[i] - 1;
0231 }
0232
0233 nvIntermediate = nvFinal = foundClusters;
0234
0235 if (verbose && 0 == threadIdx.x)
0236 printf("found %d proto vertices\n", foundClusters);
0237 }
0238
0239 }
0240
0241 #endif