// File indexing completed on 2024-09-07 04:38:05
0001 #ifndef RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
0002 #define RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
0003
0004 #include <algorithm>
0005 #include <cmath>
0006 #include <cstdint>
0007
0008 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
0009 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0010
0011 #include "gpuVertexFinder.h"
0012
0013 namespace gpuVertexFinder {
0014
0015
0016
// Cluster tracks along the beam line (z) into proto-vertices, iteratively
// merging compatible tracks until the assignment is stable.
//
// Only threadIdx.x / blockDim.x are used (blockIdx never appears), so the
// kernel assumes a single-block launch: all cooperation goes through
// block-shared memory and __syncthreads().
//
// pdata   - vertex SoA view; this kernel fills ndof() (used here as a
//           per-track neighbour counter) and nvFinal()
// pws     - workspace SoA view; inputs zt()/ezt2(), outputs izt(),
//           iv() (cluster index per track) and nvIntermediate()
// minT    - minimum number of neighbours for a track to act as cluster "core"
// eps     - maximum |zt[i] - zt[j]| for two tracks to count as neighbours
// errmax  - maximum z error for a track to take part in neighbour counting
// chi2max - maximum dz^2 / (ezt2[i] + ezt2[j]) for two tracks to be compatible
__global__ void clusterTracksIterative(VtxSoAView pdata,
                                       WsSoAView pws,
                                       int minT,
                                       float eps,
                                       float errmax,
                                       float chi2max) {
  constexpr bool verbose = false;  // compile-time switch for debug printouts

  if (verbose && 0 == threadIdx.x)
    printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

  auto er2mx = errmax * errmax;  // compare squared errors, no sqrt needed

  auto& __restrict__ data = pdata;
  auto& __restrict__ ws = pws;
  auto nt = ws.ntrks();                        // number of input tracks
  float const* __restrict__ zt = ws.zt();      // z position of each track
  float const* __restrict__ ezt2 = ws.ezt2();  // squared error on zt

  uint32_t& nvFinal = data.nvFinal();
  uint32_t& nvIntermediate = ws.nvIntermediate();

  uint8_t* __restrict__ izt = ws.izt();    // quantized z, used as histogram key
  int32_t* __restrict__ nn = data.ndof();  // neighbour count, stored in the ndof array
  int32_t* __restrict__ iv = ws.iv();      // cluster id / union-find parent per track

  assert(zt);
  assert(nn);
  assert(iv);
  assert(ezt2);

  // Block-shared histogram over the 256 possible quantized-z values; it
  // restricts all pairwise searches to tracks in nearby z bins.
  using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
  __shared__ Hist hist;
  __shared__ typename Hist::Counter hws[32];  // scratch used by hist.finalize()
  for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
    hist.off[j] = 0;
  }
  __syncthreads();

  if (verbose && 0 == threadIdx.x)
    printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

  assert((int)nt <= hist.capacity());

  // First pass: quantize z (0.1-unit bins via zt*10, clamped to the int8
  // range, shifted to be non-negative) and count histogram entries; also
  // reset the per-track clustering state.
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    assert(i < zVertex::utilities::MAXTRACKS);
    int iz = int(zt[i] * 10.);  // NOTE(review): `10.` is a double literal, promotes the product
    iz = std::clamp(iz, INT8_MIN, INT8_MAX);
    izt[i] = iz - INT8_MIN;
    assert(iz - INT8_MIN >= 0);
    assert(iz - INT8_MIN < 256);
    hist.count(izt[i]);
    iv[i] = i;  // each track starts as its own cluster
    nn[i] = 0;  // no neighbours counted yet
  }
  __syncthreads();
  if (threadIdx.x < 32)
    hws[threadIdx.x] = 0;  // zero the finalize() scratch
  __syncthreads();
  hist.finalize(hws);  // turn per-bin counts into offsets
  __syncthreads();
  assert(hist.size() == nt);
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    hist.fill(izt[i], uint16_t(i));
  }
  __syncthreads();

  // Count the neighbours of each track: tracks within eps in z and compatible
  // within chi2max. Tracks with too large an error (ezt2 > errmax^2) are
  // skipped, so their nn stays 0 and they can never become cluster cores.
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    if (ezt2[i] > er2mx)
      continue;
    auto loop = [&](uint32_t j) {
      if (i == j)
        return;
      auto dist = std::abs(zt[i] - zt[j]);
      if (dist > eps)
        return;
      if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
        return;
      nn[i]++;
    };

    // scan the track's own bin plus one bin on each side
    cms::cuda::forEachInBins(hist, izt[i], 1, loop);
  }

  __shared__ int nloops;
  nloops = 0;  // all threads store the same value; synced just below

  __syncthreads();

  // Iteratively merge compatible "core" tracks (nn >= minT) into common
  // clusters, union-find style. Passes alternate on nloops:
  //   odd pass : path compression — follow each iv chain to its root
  //   even pass: merging — pull compatible pairs toward the smaller cluster
  //              id via atomicMin, flagging `more` when anything changed
  // The loop ends once a merge pass changes nothing in any thread.
  bool more = true;
  while (__syncthreads_or(more)) {
    if (1 == nloops % 2) {
      // path compression: point every track directly at its chain's root
      for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
        auto m = iv[i];
        while (m != iv[m])
          m = iv[m];
        iv[i] = m;
      }
    } else {
      more = false;
      for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
        auto p = hist.begin() + k;
        auto i = (*p);
        auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
        if (nn[i] < minT)
          continue;  // only core tracks seed merges
        auto loop = [&](uint32_t j) {
          assert(i != j);
          if (nn[j] < minT)
            return;  // merge only core-core pairs
          auto dist = std::abs(zt[i] - zt[j]);
          if (dist > eps)
            return;
          if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
            return;
          auto old = atomicMin(&iv[j], iv[i]);
          if (old != iv[i]) {
            // iv[j] differed (or we lost a race): another iteration is needed
            more = true;
          }
          atomicMin(&iv[i], old);  // pull i down as well so ids keep converging
        };
        // start from the entry after p so each unordered pair is examined
        // only once, scanning forward up to the end of the next bin
        ++p;
        for (; p < hist.end(be); ++p)
          loop(*p);
      }
    }
    if (threadIdx.x == 0)
      ++nloops;
  }

  // Attach remaining non-core tracks (nn < minT) to the cluster of the
  // closest compatible core track within eps, if any.
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    if (nn[i] >= minT)
      continue;  // core tracks are already clustered
    float mdist = eps;  // running "closest so far" threshold
    auto loop = [&](int j) {
      if (nn[j] < minT)
        return;  // consider only core tracks as attachment points
      auto dist = std::abs(zt[i] - zt[j]);
      if (dist > mdist)
        return;
      if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
        return;
      mdist = dist;
      iv[i] = iv[j];
    };
    cms::cuda::forEachInBins(hist, izt[i], 1, loop);
  }

  __shared__ unsigned int foundClusters;
  foundClusters = 0;  // all threads store the same value; synced just below
  __syncthreads();

  // Number the clusters: each root (iv[i] == i) of a real cluster gets a
  // unique sequential id via atomicInc, temporarily stored as -(id + 1) so
  // negative ids cannot be confused with still-positive track indices.
  // Isolated non-core roots are tagged with the sentinel -9998.
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    if (iv[i] == int(i)) {
      if (nn[i] >= minT) {
        auto old = atomicInc(&foundClusters, 0xffffffff);
        iv[i] = -(old + 1);
      } else {
        iv[i] = -9998;  // noise / unassigned marker
      }
    }
  }
  __syncthreads();

  assert(foundClusters < zVertex::utilities::MAXVTX);

  // Propagate the (negative) cluster id from each root to its members.
  // NOTE(review): a single level of indirection is used, so iv chains are
  // assumed fully compressed at this point — confirm against the merge loop.
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    if (iv[i] >= 0) {
      // non-root member: copy the id already assigned to its root
      iv[i] = iv[iv[i]];
    }
  }
  __syncthreads();

  // Convert the temporary negative ids into final 0-based cluster indices:
  // -(id + 1) -> id; the -9998 sentinel maps to 9997.
  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
    iv[i] = -iv[i] - 1;
  }

  nvIntermediate = nvFinal = foundClusters;

  if (verbose && 0 == threadIdx.x)
    printf("found %d proto vertices\n", foundClusters);
}
0211
0212 }
0213
0214 #endif