// (LXR scrape artifact) Project CMSSW, displayed by LXR
// File indexing completed on 2021-04-07 01:19:49

0001 #ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
0002 #define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
0003 
0004 #include <algorithm>
0005 #include <cmath>
0006 #include <cstdint>
0007 
0008 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
0009 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0010 
0011 #include "gpuVertexFinder.h"
0012 
0013 namespace gpuVertexFinder {
0014 
0015   __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
0016     constexpr bool verbose = false;  // in principle the compiler should optmize out if false
0017 
0018     auto& __restrict__ data = *pdata;
0019     auto& __restrict__ ws = *pws;
0020     auto nt = ws.ntrks;
0021     float const* __restrict__ zt = ws.zt;
0022     float const* __restrict__ ezt2 = ws.ezt2;
0023     float* __restrict__ zv = data.zv;
0024     float* __restrict__ wv = data.wv;
0025     float const* __restrict__ chi2 = data.chi2;
0026     uint32_t& nvFinal = data.nvFinal;
0027 
0028     int32_t const* __restrict__ nn = data.ndof;
0029     int32_t* __restrict__ iv = ws.iv;
0030 
0031     assert(pdata);
0032     assert(zt);
0033 
0034     // one vertex per block
0035     for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) {
0036       if (nn[kv] < 4)
0037         continue;
0038       if (chi2[kv] < maxChi2 * float(nn[kv]))
0039         continue;
0040 
0041       constexpr int MAXTK = 512;
0042       assert(nn[kv] < MAXTK);
0043       if (nn[kv] >= MAXTK)
0044         continue;                      // too bad FIXME
0045       __shared__ uint32_t it[MAXTK];   // track index
0046       __shared__ float zz[MAXTK];      // z pos
0047       __shared__ uint8_t newV[MAXTK];  // 0 or 1
0048       __shared__ float ww[MAXTK];      // z weight
0049 
0050       __shared__ uint32_t nq;  // number of track for this vertex
0051       nq = 0;
0052       __syncthreads();
0053 
0054       // copy to local
0055       for (auto k = threadIdx.x; k < nt; k += blockDim.x) {
0056         if (iv[k] == int(kv)) {
0057           auto old = atomicInc(&nq, MAXTK);
0058           zz[old] = zt[k] - zv[kv];
0059           newV[old] = zz[old] < 0 ? 0 : 1;
0060           ww[old] = 1.f / ezt2[k];
0061           it[old] = k;
0062         }
0063       }
0064 
0065       __shared__ float znew[2], wnew[2];  // the new vertices
0066 
0067       __syncthreads();
0068       assert(int(nq) == nn[kv] + 1);
0069 
0070       int maxiter = 20;
0071       // kt-min....
0072       bool more = true;
0073       while (__syncthreads_or(more)) {
0074         more = false;
0075         if (0 == threadIdx.x) {
0076           znew[0] = 0;
0077           znew[1] = 0;
0078           wnew[0] = 0;
0079           wnew[1] = 0;
0080         }
0081         __syncthreads();
0082         for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
0083           auto i = newV[k];
0084           atomicAdd(&znew[i], zz[k] * ww[k]);
0085           atomicAdd(&wnew[i], ww[k]);
0086         }
0087         __syncthreads();
0088         if (0 == threadIdx.x) {
0089           znew[0] /= wnew[0];
0090           znew[1] /= wnew[1];
0091         }
0092         __syncthreads();
0093         for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
0094           auto d0 = fabs(zz[k] - znew[0]);
0095           auto d1 = fabs(zz[k] - znew[1]);
0096           auto newer = d0 < d1 ? 0 : 1;
0097           more |= newer != newV[k];
0098           newV[k] = newer;
0099         }
0100         --maxiter;
0101         if (maxiter <= 0)
0102           more = false;
0103       }
0104 
0105       // avoid empty vertices
0106       if (0 == wnew[0] || 0 == wnew[1])
0107         continue;
0108 
0109       // quality cut
0110       auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);
0111 
0112       auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);
0113 
0114       if (verbose && 0 == threadIdx.x)
0115         printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);
0116 
0117       if (chi2Dist < 4)
0118         continue;
0119 
0120       // get a new global vertex
0121       __shared__ uint32_t igv;
0122       if (0 == threadIdx.x)
0123         igv = atomicAdd(&ws.nvIntermediate, 1);
0124       __syncthreads();
0125       for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
0126         if (1 == newV[k])
0127           iv[it[k]] = igv;
0128       }
0129 
0130     }  // loop on vertices
0131   }
0132 
  // Kernel entry point for splitVertices: processes one candidate vertex per
  // block via the grid-stride loop inside splitVertices, so any 1-D
  // grid/block configuration is valid. Arguments are forwarded unchanged
  // (device pointers to the vertex data and track workspace, plus the
  // chi2-per-ndof split threshold).
  __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
    splitVertices(pdata, pws, maxChi2);
  }
0136 
0137 }  // namespace gpuVertexFinder
0138 
0139 #endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h