#ifndef RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
#define RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h

#include <algorithm>
#include <cmath>
#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#include "gpuVertexFinder.h"

namespace gpuVertexFinder {

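  // Cluster tracks along the beam line with a DBSCAN-like algorithm:
  //  - tracks are binned in a shared-memory histogram of their quantized z position;
  //  - a track is a "core" track if at least minT other tracks lie within eps in z
  //    (tracks whose squared z error exceeds errmax^2 never count their neighbours,
  //    so they cannot become core tracks);
  //  - each core track is linked to the neighbouring core track with the smallest z,
  //    collapsing every density-connected region onto a single root;
  //  - the remaining tracks are attached to the closest core track within eps if the
  //    separation also satisfies dist^2 <= chi2max * (ezt2[i] + ezt2[j]).
  // On output iv[] holds the proto-vertex index of every track (unassigned tracks get
  // a large sentinel value) and nvFinal/nvIntermediate hold the number of proto-vertices.
  // The kernel strides over the tracks with threadIdx.x only and synchronizes the
  // phases with __syncthreads(), so it is meant to run as a single block.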
  __global__ void clusterTracksDBSCAN(VtxSoAView pdata,
                                      WsSoAView pws,
                                      int minT,
                                      float eps,
                                      float errmax,
                                      float chi2max) {
    constexpr bool verbose = false;

    if (verbose && 0 == threadIdx.x)
      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

    auto er2mx = errmax * errmax;

    auto& __restrict__ data = pdata;
    auto& __restrict__ ws = pws;
    auto nt = ws.ntrks();
    float const* __restrict__ zt = ws.zt();
    float const* __restrict__ ezt2 = ws.ezt2();

    uint32_t& nvFinal = data.nvFinal();
    uint32_t& nvIntermediate = ws.nvIntermediate();

    uint8_t* __restrict__ izt = ws.izt();
    int32_t* __restrict__ nn = data.ndof();
    int32_t* __restrict__ iv = ws.iv();

    assert(zt);
    assert(iv);
    assert(nn);
    assert(ezt2);

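    // Shared-memory histogram of the quantized track z positions: 256 bins with room
    // for up to 16000 track indices (uint16_t payload); hws is scratch space used by
    // hist.finalize() to turn the per-bin counts into cumulative offsets.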
    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
    __shared__ Hist hist;
    __shared__ typename Hist::Counter hws[32];
    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
      hist.off[j] = 0;
    }
    __syncthreads();

    if (verbose && 0 == threadIdx.x)
      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

    assert((int)nt <= hist.capacity());

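    // Quantize each track z into an 8-bit bin index izt[i] (zt * 10 clamped to the
    // int8 range; assuming z is in cm this gives 1 mm wide bins covering roughly
    // |z| < 12.8 cm), count the bin occupancies, and start every track as its own
    // cluster candidate (iv[i] = i) with zero neighbours (nn[i] = 0); then finalize
    // the histogram (prefix sum of the counts) and fill it with the track indices.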
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      assert(i < zVertex::utilities::MAXTRACKS);
      int iz = int(zt[i] * 10.);
      iz = std::clamp(iz, INT8_MIN, INT8_MAX);
      izt[i] = iz - INT8_MIN;
      assert(iz - INT8_MIN >= 0);
      assert(iz - INT8_MIN < 256);
      hist.count(izt[i]);
      iv[i] = i;
      nn[i] = 0;
    }
    __syncthreads();
    if (threadIdx.x < 32)
      hws[threadIdx.x] = 0;
    __syncthreads();
    hist.finalize(hws);
    __syncthreads();
    assert(hist.size() == nt);
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      hist.fill(izt[i], uint16_t(i));
    }
    __syncthreads();

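    // Count, for every track whose z error passes the errmax cut, the number of other
    // tracks closer than eps in z, scanning only the track's own histogram bin and
    // the two neighbouring bins.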
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (ezt2[i] > er2mx)
        continue;
      auto loop = [&](uint32_t j) {
        if (i == j)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;

        nn[i]++;
      };

      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __syncthreads();

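    // For every core track (nn[i] >= minT) point iv[i] to the neighbouring core track
    // with the smallest z below zt[i] (within eps, using the same three-bin search
    // window). The links always walk towards smaller z, so each density-connected
    // region collapses onto a single root track.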
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (nn[i] < minT)
        continue;
      float mz = zt[i];
      auto loop = [&](uint32_t j) {
        if (zt[j] >= mz)
          return;
        if (nn[j] < minT)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;

        mz = zt[j];
        iv[i] = j;
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __syncthreads();

#ifdef GPU_DEBUG
    // verify that the links contain no two-track cycles
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] != int(i))
        assert(iv[iv[i]] != int(i));
    }
    __syncthreads();
#endif

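    // Consolidate the chains: follow the links up to the self-referencing root and
    // point every track directly at it (path compression).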
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      auto m = iv[i];
      while (m != iv[m])
        m = iv[m];
      iv[i] = m;
    }

    __syncthreads();

#ifdef GPU_DEBUG
    // the compressed links must still contain no two-track cycles
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] != int(i))
        assert(iv[iv[i]] != int(i));
    }
    __syncthreads();
#endif

#ifdef GPU_DEBUG
    // core tracks closer than eps must end up in the same cluster,
    // and a root must not sit at larger z than the tracks pointing to it
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (nn[i] < minT)
        continue;
      assert(zt[iv[i]] <= zt[i]);
      auto loop = [&](uint32_t j) {
        if (nn[j] < minT)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;

        if (iv[i] != iv[j]) {
          printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]);
          printf("      %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]);
        }
        assert(iv[i] == iv[j]);
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }
    __syncthreads();
#endif

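    // Attach each remaining non-core track to the cluster of the closest core track
    // within eps, provided the separation is also compatible with the two z errors:
    // dist^2 <= chi2max * (ezt2[i] + ezt2[j]).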
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (nn[i] >= minT)
        continue;
      float mdist = eps;
      auto loop = [&](uint32_t j) {
        if (nn[j] < minT)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        mdist = dist;
        iv[i] = iv[j];
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

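    // Count the clusters: every core root receives a new cluster number, stored
    // temporarily as a negative value (-(id + 1)); roots that are not core tracks
    // are flagged as noise with the sentinel -9998.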
    __shared__ unsigned int foundClusters;
    foundClusters = 0;
    __syncthreads();

    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] == int(i)) {
        if (nn[i] >= minT) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          iv[i] = -(old + 1);
        } else {
          iv[i] = -9998;
        }
      }
    }
    __syncthreads();

    assert(foundClusters < zVertex::utilities::MAXVTX);

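    // Propagate the (still negative) cluster number from each root to the tracks
    // that point to it.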
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] >= 0) {
        iv[i] = iv[iv[i]];
      }
    }
    __syncthreads();

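    // Flip the encoding back to a plain cluster index: -(id + 1) becomes id, so
    // clustered tracks end up in [0, foundClusters) while noise tracks (-9998)
    // end up at 9997, above any valid vertex index.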
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      iv[i] = -iv[i] - 1;
    }

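    // Publish the number of proto-vertices to both the output SoA and the workspace.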
    nvIntermediate = nvFinal = foundClusters;

    if (verbose && 0 == threadIdx.x)
      printf("found %d proto vertices\n", foundClusters);
  }

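  // Minimal usage sketch (illustrative only: the actual launch configuration and the
  // code that fills the workspace live elsewhere in the vertex finder, and the block
  // size below is an assumption). Since the kernel indexes tracks with threadIdx.x
  // only and synchronizes with __syncthreads(), it has to be launched as one block:
  //
  //   clusterTracksDBSCAN<<<1, 1024, 0, stream>>>(vertexView, wsView,
  //                                               minT, eps, errmax, chi2max);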
}  // namespace gpuVertexFinder

#endif  // RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h