File indexing completed on 2024-09-07 04:38:05
0001 #ifndef RecoVertex_PixelVertexFinding_plugins_gpuSplitVertices_h
0002 #define RecoVertex_PixelVertexFinding_plugins_gpuSplitVertices_h
0003
0004 #include <algorithm>
0005 #include <cmath>
0006 #include <cstdint>
0007
0008 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
0009 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0010
0011 #include "gpuVertexFinder.h"
0012
0013 namespace gpuVertexFinder {
0014
0015 __device__ __forceinline__ void splitVertices(VtxSoAView& pdata, WsSoAView& pws, float maxChi2) {
0016 constexpr bool verbose = false;
0017
0018 auto& __restrict__ data = pdata;
0019 auto& __restrict__ ws = pws;
0020 auto nt = ws.ntrks();
0021 float const* __restrict__ zt = ws.zt();
0022 float const* __restrict__ ezt2 = ws.ezt2();
0023 float* __restrict__ zv = data.zv();
0024 float* __restrict__ wv = data.wv();
0025 float const* __restrict__ chi2 = data.chi2();
0026 uint32_t& nvFinal = data.nvFinal();
0027
0028 int32_t const* __restrict__ nn = data.ndof();
0029 int32_t* __restrict__ iv = ws.iv();
0030
0031 assert(zt);
0032 assert(wv);
0033 assert(chi2);
0034 assert(nn);
0035
0036
0037 for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) {
0038 if (nn[kv] < 4)
0039 continue;
0040 if (chi2[kv] < maxChi2 * float(nn[kv]))
0041 continue;
0042
0043 constexpr int MAXTK = 512;
0044 assert(nn[kv] < MAXTK);
0045 if (nn[kv] >= MAXTK)
0046 continue;
0047 __shared__ uint32_t it[MAXTK];
0048 __shared__ float zz[MAXTK];
0049 __shared__ uint8_t newV[MAXTK];
0050 __shared__ float ww[MAXTK];
0051
0052 __shared__ uint32_t nq;
0053 nq = 0;
0054 __syncthreads();
0055
0056
0057 for (auto k = threadIdx.x; k < nt; k += blockDim.x) {
0058 if (iv[k] == int(kv)) {
0059 auto old = atomicInc(&nq, MAXTK);
0060 zz[old] = zt[k] - zv[kv];
0061 newV[old] = zz[old] < 0 ? 0 : 1;
0062 ww[old] = 1.f / ezt2[k];
0063 it[old] = k;
0064 }
0065 }
0066
0067 __shared__ float znew[2], wnew[2];
0068
0069 __syncthreads();
0070 assert(int(nq) == nn[kv] + 1);
0071
0072 int maxiter = 20;
0073
0074 bool more = true;
0075 while (__syncthreads_or(more)) {
0076 more = false;
0077 if (0 == threadIdx.x) {
0078 znew[0] = 0;
0079 znew[1] = 0;
0080 wnew[0] = 0;
0081 wnew[1] = 0;
0082 }
0083 __syncthreads();
0084 for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
0085 auto i = newV[k];
0086 atomicAdd(&znew[i], zz[k] * ww[k]);
0087 atomicAdd(&wnew[i], ww[k]);
0088 }
0089 __syncthreads();
0090 if (0 == threadIdx.x) {
0091 znew[0] /= wnew[0];
0092 znew[1] /= wnew[1];
0093 }
0094 __syncthreads();
0095 for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
0096 auto d0 = fabs(zz[k] - znew[0]);
0097 auto d1 = fabs(zz[k] - znew[1]);
0098 auto newer = d0 < d1 ? 0 : 1;
0099 more |= newer != newV[k];
0100 newV[k] = newer;
0101 }
0102 --maxiter;
0103 if (maxiter <= 0)
0104 more = false;
0105 }
0106
0107
0108 if (0 == wnew[0] || 0 == wnew[1])
0109 continue;
0110
0111
0112 auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);
0113
0114 auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);
0115
0116 if (verbose && 0 == threadIdx.x)
0117 printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);
0118
0119 if (chi2Dist < 4)
0120 continue;
0121
0122
0123 __shared__ uint32_t igv;
0124 if (0 == threadIdx.x)
0125 igv = atomicAdd(&ws.nvIntermediate(), 1);
0126 __syncthreads();
0127 for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
0128 if (1 == newV[k])
0129 iv[it[k]] = igv;
0130 }
0131
0132 }
0133 }
0134
0135 __global__ void splitVerticesKernel(VtxSoAView pdata, WsSoAView pws, float maxChi2) {
0136 splitVertices(pdata, pws, maxChi2);
0137 }
0138
0139 }
0140
0141 #endif