// File indexing completed on 2021-05-17 03:00:32
0001 #ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
0002 #define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
0003
0004 #include <algorithm>
0005 #include <cmath>
0006 #include <cstdint>
0007
0008 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
0009 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0010
0011 #include "gpuVertexFinder.h"
0012
0013 namespace gpuVertexFinder {
0014
0015
0016
// Kernel: cluster reconstructed tracks into vertex candidates along the
// beam (z) axis. Tracks are binned into a shared-memory histogram for fast
// neighbour lookup, then mutually compatible tracks are merged with an
// iterative, union-find-like scheme driven by atomicMin on cluster labels.
//
// NOTE(review): every per-track loop strides by blockDim.x only and
// blockIdx is never read, so the kernel is written for a single-block
// launch (one block per event); __syncthreads() is the global barrier
// between phases. Confirm against the launch site.
//
// Parameters:
//   pdata   - output vertices: ndof[] is (ab)used here as a per-track
//             neighbour counter; nvFinal receives the cluster count
//   pws     - workspace: per-track z (zt), z-error^2 (ezt2), scratch bin
//             index (izt) and cluster label (iv); nvIntermediate set too
//   minT    - minimum number of neighbours for a track to act as a seed
//   eps     - maximum |dz| between neighbouring tracks (units follow zt;
//             presumably cm -- confirm upstream)
//   errmax  - maximum per-track z error for a track to be counted as seed
//   chi2max - cut on dz^2 / (ezt2[i] + ezt2[j]) for pair compatibility
0017 __global__ void clusterTracksIterative(ZVertices* pdata,
0018 WorkSpace* pws,
0019 int minT,
0020 float eps,
0021 float errmax,
0022 float chi2max
0023 ) {
0024 constexpr bool verbose = false;
0025
0026 if (verbose && 0 == threadIdx.x)
0027 printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
0028
// Pre-square the error cut so the per-track test avoids a sqrt.
0029 auto er2mx = errmax * errmax;
0030
0031 auto& __restrict__ data = *pdata;
0032 auto& __restrict__ ws = *pws;
0033 auto nt = ws.ntrks;
0034 float const* __restrict__ zt = ws.zt;
0035 float const* __restrict__ ezt2 = ws.ezt2;
0036
0037 uint32_t& nvFinal = data.nvFinal;
0038 uint32_t& nvIntermediate = ws.nvIntermediate;
0039
// izt: per-track histogram bin; nn: neighbour count (stored in data.ndof);
// iv: per-track cluster label, initialised to self (union-find parent).
0040 uint8_t* __restrict__ izt = ws.izt;
0041 int32_t* __restrict__ nn = data.ndof;
0042 int32_t* __restrict__ iv = ws.iv;
0043
0044 assert(pdata);
0045 assert(zt);
0046
// 256-bin histogram over the (clamped) z coordinate, shared by the block.
0047 using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
0048 __shared__ Hist hist;
0049 __shared__ typename Hist::Counter hws[32];
// Phase 1a: zero the histogram offsets cooperatively.
0050 for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
0051 hist.off[j] = 0;
0052 }
0053 __syncthreads();
0054
0055 if (verbose && 0 == threadIdx.x)
0056 printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
0057
0058 assert((int)nt <= hist.capacity());
0059
0060
// Phase 1b: bin each track. zt*10 fixes the bin width at 0.1 z-units;
// the bin index is clamped into int8 range then shifted to [0, 255].
0061 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0062 assert(i < ZVertices::MAXTRACKS);
0063 int iz = int(zt[i] * 10.);
0064 iz = std::clamp(iz, INT8_MIN, INT8_MAX);
0065 izt[i] = iz - INT8_MIN;
0066 assert(iz - INT8_MIN >= 0);
0067 assert(iz - INT8_MIN < 256);
0068 hist.count(izt[i]);
0069 iv[i] = i;
0070 nn[i] = 0;
0071 }
0072 __syncthreads();
// Phase 1c: prefix-sum the counts (finalize) then fill track indices.
0073 if (threadIdx.x < 32)
0074 hws[threadIdx.x] = 0;
0075 __syncthreads();
0076 hist.finalize(hws);
0077 __syncthreads();
0078 assert(hist.size() == nt);
0079 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0080 hist.fill(izt[i], uint16_t(i));
0081 }
0082 __syncthreads();
0083
0084
// Phase 2: count, for each well-measured track (ezt2 <= errmax^2), the
// neighbours within eps in z that also pass the pair chi2 cut. The scan
// visits the track's own bin +/- 1 via forEachInBins.
0085 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0086 if (ezt2[i] > er2mx)
0087 continue;
0088 auto loop = [&](uint32_t j) {
0089 if (i == j)
0090 return;
0091 auto dist = std::abs(zt[i] - zt[j]);
0092 if (dist > eps)
0093 return;
0094 if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
0095 return;
0096 nn[i]++;
0097 };
0098
0099 cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0100 }
0101
// Iteration counter shared by the block; every thread stores 0, which is
// race-free only because all writes agree, then the barrier publishes it.
0102 __shared__ int nloops;
0103 nloops = 0;
0104
0105 __syncthreads();
0106
0107
// Phase 3: iteratively merge compatible seed tracks. The loop keeps going
// while ANY thread still made progress (__syncthreads_or over 'more').
// Odd iterations compress label chains (iv[i] -> root); even iterations
// link neighbouring seeds by taking the minimum label via atomicMin.
0108 bool more = true;
0109 while (__syncthreads_or(more)) {
0110 if (1 == nloops % 2) {
// Path compression: chase iv until a fixed point, then store the root.
0111 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0112 auto m = iv[i];
0113 while (m != iv[m])
0114 m = iv[m];
0115 iv[i] = m;
0116 }
0117 } else {
0118 more = false;
// Iterate over histogram order; for each seed, only pair it with
// entries AFTER it up to the end of the next bin (++p below), so each
// unordered pair is examined once per sweep.
0119 for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
0120 auto p = hist.begin() + k;
0121 auto i = (*p);
0122 auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
0123 if (nn[i] < minT)
0124 continue;
0125 auto loop = [&](uint32_t j) {
0126 assert(i != j);
0127 if (nn[j] < minT)
0128 return;
0129 auto dist = std::abs(zt[i] - zt[j]);
0130 if (dist > eps)
0131 return;
0132 if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
0133 return;
// Pull j's label down to i's; if j already had i's label we are
// converged for this pair, otherwise flag another sweep. Then pull
// i's label down to j's old one, so the smaller label wins overall.
0134 auto old = atomicMin(&iv[j], iv[i]);
0135 if (old != iv[i]) {
0136
0137 more = true;
0138 }
0139 atomicMin(&iv[i], old);
0140 };
0141 ++p;
0142 for (; p < hist.end(be); ++p)
0143 loop(*p);
0144 }
0145 }
0146 if (threadIdx.x == 0)
0147 ++nloops;
0148 }
0149
0150
// Phase 4: attach non-seed tracks (nn < minT) to the nearest seed within
// eps that also passes the chi2 cut; mdist shrinks to keep the closest.
0151 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0152
0153 if (nn[i] >= minT)
0154 continue;
0155 float mdist = eps;
0156 auto loop = [&](int j) {
0157 if (nn[j] < minT)
0158 return;
0159 auto dist = std::abs(zt[i] - zt[j]);
0160 if (dist > mdist)
0161 return;
0162 if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
0163 return;
0164 mdist = dist;
0165 iv[i] = iv[j];
0166 };
0167 cms::cuda::forEachInBins(hist, izt[i], 1, loop);
0168 }
0169
0170 __shared__ unsigned int foundClusters;
0171 foundClusters = 0;
0172 __syncthreads();
0173
0174
// Phase 5: number the clusters. Each self-rooted seed (iv[i] == i with
// enough neighbours) claims a unique id via atomicInc, stored temporarily
// as the negative value -(id+1); self-rooted non-seeds become outliers
// (-9998 maps to id 9997 after the final sign flip).
0175
0176 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0177 if (iv[i] == int(i)) {
0178 if (nn[i] >= minT) {
0179 auto old = atomicInc(&foundClusters, 0xffffffff);
0180 iv[i] = -(old + 1);
0181 } else {
0182 iv[i] = -9998;
0183 }
0184 }
0185 }
0186 __syncthreads();
0187
0188 assert(foundClusters < ZVertices::MAXVTX);
0189
0190
// Propagate the (negative) cluster id from each root to its members.
0191 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0192 if (iv[i] >= 0) {
0193
0194 iv[i] = iv[iv[i]];
0195 }
0196 }
0197 __syncthreads();
0198
0199
// Flip the sign convention: -(id+1) becomes the final 0-based cluster id.
0200 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0201 iv[i] = -iv[i] - 1;
0202 }
0203
// Publish the cluster count for the downstream fit/sort kernels.
0204 nvIntermediate = nvFinal = foundClusters;
0205
0206 if (verbose && 0 == threadIdx.x)
0207 printf("found %d proto vertices\n", foundClusters);
0208 }
0209
0210 }
0211
0212 #endif