#ifndef RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
#define RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h

#include <algorithm>
#include <cmath>
#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#include "gpuVertexFinder.h"

namespace gpuVertexFinder {

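  // Clusters tracks along z with a density-based approach: each track counts its
  // neighbours within eps, is linked to its nearest neighbour of equal or higher
  // density, and density peaks with at least minT neighbours become vertex seeds.
  // All tracks are processed by a single block (block-stride loops, block-shared
  // histogram), so the algorithm does not scale beyond one block.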
  __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView& pdata,
                                                         WsSoAView& pws,
                                                         int minT,      // min number of neighbours to be a "seed"
                                                         float eps,     // max absolute distance to cluster
                                                         float errmax,  // max track z error to qualify as a "seed"
                                                         float chi2max  // max normalized distance to cluster
  ) {
    using namespace gpuVertexFinder;
    constexpr bool verbose = false;

    if (verbose && 0 == threadIdx.x)
      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

    auto er2mx = errmax * errmax;

    auto& __restrict__ data = pdata;
    auto& __restrict__ ws = pws;
    auto nt = ws.ntrks();
    float const* __restrict__ zt = ws.zt();
    float const* __restrict__ ezt2 = ws.ezt2();

    uint32_t& nvFinal = data.nvFinal();
    uint32_t& nvIntermediate = ws.nvIntermediate();

    uint8_t* __restrict__ izt = ws.izt();
    int32_t* __restrict__ nn = data.ndof();
    int32_t* __restrict__ iv = ws.iv();

    assert(zt);
    assert(ezt2);
    assert(izt);
    assert(nn);
    assert(iv);

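    // bin the tracks in z with a block-shared histogram, so that neighbour
    // searches only need to visit the bin of a track and the adjacent ones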
    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
    __shared__ Hist hist;
    __shared__ typename Hist::Counter hws[32];
    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
      hist.off[j] = 0;
    }
    __syncthreads();

    if (verbose && 0 == threadIdx.x)
      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

    assert((int)nt <= hist.capacity());

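    // fill the histogram: the bin width (0.1 in z) must be wider than eps,
    // so that all neighbour candidates are found within +/- one bin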
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      assert(i < zVertex::utilities::MAXTRACKS);
      int iz = int(zt[i] * 10.);  // bin width 0.1 in z
      iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
      izt[i] = iz - INT8_MIN;  // shift to [0, 255] to fit the uint8_t bin index
      assert(iz - INT8_MIN >= 0);
      assert(iz - INT8_MIN < 256);
      hist.count(izt[i]);
      iv[i] = i;   // initially each track is its own cluster
      nn[i] = 0;   // neighbour count
    }
    __syncthreads();
    if (threadIdx.x < 32)
      hws[threadIdx.x] = 0;  // scratch counters used by the histogram prefix scan
    __syncthreads();
    hist.finalize(hws);
    __syncthreads();
    assert(hist.size() == nt);
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      hist.fill(izt[i], uint16_t(i));
    }
    __syncthreads();

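    // count, for each track with small enough error, the neighbours that are
    // within eps in z and compatible within chi2max given the two uncertainties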
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (ezt2[i] > er2mx)
        continue;
      auto loop = [&](uint32_t j) {
        if (i == j)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        nn[i]++;
      };

      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __syncthreads();

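    // for each track, find the closest compatible track of equal or higher
    // density (neighbour count) and link to it; density peaks remain linked to themselves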
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      float mdist = eps;
      auto loop = [&](uint32_t j) {
        if (nn[j] < nn[i])
          return;
        if (nn[j] == nn[i] && zt[j] >= zt[i])
          return;  // if the density is equal, break the tie using the natural z order
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        mdist = dist;
        iv[i] = j;  // link i to its nearest higher-density neighbour
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __syncthreads();

#ifdef GPU_DEBUG
    // mini verification: the nearest-neighbour links must not form two-cycles
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] != int(i))
        assert(iv[iv[i]] != int(i));
    }
    __syncthreads();
#endif

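    // consolidate the graph: follow the links until the density peak (the cluster
    // seed, which points to itself) is reached, and attach every track directly to it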
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      auto m = iv[i];
      while (m != iv[m])
        m = iv[m];
      iv[i] = m;
    }

#ifdef GPU_DEBUG
    __syncthreads();
    // mini verification: repeated after the consolidation step
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] != int(i))
        assert(iv[iv[i]] != int(i));
    }
#endif

#ifdef GPU_DEBUG
    // verify that no cluster has been split: redoing the nearest-higher-density
    // search must land in the same cluster, and the seed of a cluster must have
    // at least as many neighbours as any track assigned to it
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      auto minJ = i;
      auto mdist = eps;
      auto loop = [&](uint32_t j) {
        if (nn[j] < nn[i])
          return;
        if (nn[j] == nn[i] && zt[j] >= zt[i])
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        mdist = dist;
        minJ = j;
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
      assert(iv[i] == iv[minJ]);
      assert(nn[i] <= nn[iv[i]]);
    }
    __syncthreads();
#endif

    __shared__ unsigned int foundClusters;
    foundClusters = 0;
    __syncthreads();

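    // identify the clusters: a track that is its own peak (iv[i] == i) becomes a
    // vertex seed if it has at least minT neighbours and gets a unique negative id;
    // otherwise it is flagged as noise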
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] == int(i)) {
        if (nn[i] >= minT) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          iv[i] = -(old + 1);
        } else {
          iv[i] = -9998;  // noise
        }
      }
    }
    __syncthreads();

    assert(foundClusters < zVertex::utilities::MAXVTX);

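    // propagate the negative id of each seed to all the tracks attached to it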
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] >= 0) {
        // not a seed: copy the cluster id from the seed this track points to
        iv[i] = iv[iv[i]];
      }
    }
    __syncthreads();

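    // turn the ids back into non-negative cluster indices starting from 0
    // (noise tracks, marked -9998, end up with index 9997)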
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      iv[i] = -iv[i] - 1;
    }

    nvIntermediate = nvFinal = foundClusters;

    if (verbose && 0 == threadIdx.x)
      printf("found %d proto vertices\n", foundClusters);
  }

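  // Thin kernel wrapper around the device function above. The clustering relies
  // only on threadIdx/blockDim strides and block-shared state, so the kernel must
  // be launched with a single block to process all tracks consistently.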
  __global__ void clusterTracksByDensityKernel(VtxSoAView pdata,
                                               WsSoAView pws,
                                               int minT,      // min number of neighbours to be a "seed"
                                               float eps,     // max absolute distance to cluster
                                               float errmax,  // max track z error to qualify as a "seed"
                                               float chi2max  // max normalized distance to cluster
  ) {
    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
  }

}  // namespace gpuVertexFinder

#endif  // RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h