File indexing completed on 2024-09-07 04:38:05
0001 #ifndef RecoVertex_PixelVertexFinding_plugins_gpuFitVertices_h
0002 #define RecoVertex_PixelVertexFinding_plugins_gpuFitVertices_h
0003
0004 #include <algorithm>
0005 #include <cmath>
0006 #include <cstdint>
0007
0008 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
0009 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0010
0011 #include "gpuVertexFinder.h"
0012
0013 namespace gpuVertexFinder {
0014
0015 __device__ __forceinline__ void fitVertices(VtxSoAView& pdata,
0016 WsSoAView& pws,
0017 float chi2Max
0018 ) {
0019 constexpr bool verbose = false;
0020
0021 auto& __restrict__ data = pdata;
0022 auto& __restrict__ ws = pws;
0023 auto nt = ws.ntrks();
0024 float const* __restrict__ zt = ws.zt();
0025 float const* __restrict__ ezt2 = ws.ezt2();
0026 float* __restrict__ zv = data.zv();
0027 float* __restrict__ wv = data.wv();
0028 float* __restrict__ chi2 = data.chi2();
0029 uint32_t& nvFinal = data.nvFinal();
0030 uint32_t& nvIntermediate = ws.nvIntermediate();
0031
0032 int32_t* __restrict__ nn = data.ndof();
0033 int32_t* __restrict__ iv = ws.iv();
0034
0035 assert(nvFinal <= nvIntermediate);
0036 nvFinal = nvIntermediate;
0037 auto foundClusters = nvFinal;
0038
0039
0040 for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
0041 zv[i] = 0;
0042 wv[i] = 0;
0043 chi2[i] = 0;
0044 }
0045
0046
0047 __shared__ int noise;
0048 if (verbose && 0 == threadIdx.x)
0049 noise = 0;
0050
0051 __syncthreads();
0052
0053
0054 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0055 if (iv[i] > 9990) {
0056 if (verbose)
0057 atomicAdd(&noise, 1);
0058 continue;
0059 }
0060 assert(iv[i] >= 0);
0061 assert(iv[i] < int(foundClusters));
0062 auto w = 1.f / ezt2[i];
0063 atomicAdd_block(&zv[iv[i]], zt[i] * w);
0064 atomicAdd_block(&wv[iv[i]], w);
0065 }
0066
0067 __syncthreads();
0068
0069 for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
0070 assert(wv[i] > 0.f);
0071 zv[i] /= wv[i];
0072 nn[i] = -1;
0073 }
0074 __syncthreads();
0075
0076
0077 for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0078 if (iv[i] > 9990)
0079 continue;
0080
0081 auto c2 = zv[iv[i]] - zt[i];
0082 c2 *= c2 / ezt2[i];
0083 if (c2 > chi2Max) {
0084 iv[i] = 9999;
0085 continue;
0086 }
0087 atomicAdd_block(&chi2[iv[i]], c2);
0088 atomicAdd_block(&nn[iv[i]], 1);
0089 }
0090 __syncthreads();
0091 for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x)
0092 if (nn[i] > 0)
0093 wv[i] *= float(nn[i]) / chi2[i];
0094
0095 if (verbose && 0 == threadIdx.x)
0096 printf("found %d proto clusters ", foundClusters);
0097 if (verbose && 0 == threadIdx.x)
0098 printf("and %d noise\n", noise);
0099 }
0100
0101 __global__ void fitVerticesKernel(VtxSoAView pdata,
0102 WsSoAView pws,
0103 float chi2Max
0104 ) {
0105 fitVertices(pdata, pws, chi2Max);
0106 }
0107
0108 }
0109
0110 #endif