PixelVertexFinding/plugins/gpuFitVertices.h

0001 #ifndef RecoVertex_PixelVertexFinding_plugins_gpuFitVertices_h
0002 #define RecoVertex_PixelVertexFinding_plugins_gpuFitVertices_h
0003
0004 #include <algorithm>
0005 #include <cmath>
0006 #include <cstdint>
0007
0008 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
0009 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
0010
0011 #include "gpuVertexFinder.h"
0012
0013 namespace gpuVertexFinder {
0014
0015   __device__ __forceinline__ void fitVertices(VtxSoAView& pdata,
0016                                               WsSoAView& pws,
0017                                               float chi2Max  // for outlier rejection
0018   ) {
0019     constexpr bool verbose = false;  // in principle the compiler should optmize out if false
0020
0021     auto& __restrict__ data = pdata;
0022     auto& __restrict__ ws = pws;
0023     auto nt = ws.ntrks();
0024     float const* __restrict__ zt = ws.zt();
0025     float const* __restrict__ ezt2 = ws.ezt2();
0026     float* __restrict__ zv = data.zv();
0027     float* __restrict__ wv = data.wv();
0028     float* __restrict__ chi2 = data.chi2();
0029     uint32_t& nvFinal = data.nvFinal();
0030     uint32_t& nvIntermediate = ws.nvIntermediate();
0031
0032     int32_t* __restrict__ nn = data.ndof();
0033     int32_t* __restrict__ iv = ws.iv();
0034
0035     assert(nvFinal <= nvIntermediate);
0036     nvFinal = nvIntermediate;
0037     auto foundClusters = nvFinal;
0038
0039     // zero
0040     for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
0041       zv[i] = 0;
0042       wv[i] = 0;
0043       chi2[i] = 0;
0044     }
0045
0046     // only for test
0047     __shared__ int noise;
0048     if (verbose && 0 == threadIdx.x)
0049       noise = 0;
0050
0051     __syncthreads();
0052
0053     // compute cluster location
0054     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0055       if (iv[i] > 9990) {
0056         if (verbose)
0057           atomicAdd(&noise, 1);
0058         continue;
0059       }
0060       assert(iv[i] >= 0);
0061       assert(iv[i] < int(foundClusters));
0062       auto w = 1.f / ezt2[i];
0063       atomicAdd_block(&zv[iv[i]], zt[i] * w);
0064       atomicAdd_block(&wv[iv[i]], w);
0065     }
0066
0067     __syncthreads();
0068     // reuse nn
0069     for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
0070       assert(wv[i] > 0.f);
0071       zv[i] /= wv[i];
0072       nn[i] = -1;  // ndof
0073     }
0074     __syncthreads();
0075
0076     // compute chi2
0077     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
0078       if (iv[i] > 9990)
0079         continue;
0080
0081       auto c2 = zv[iv[i]] - zt[i];
0082       c2 *= c2 / ezt2[i];
0083       if (c2 > chi2Max) {
0084         iv[i] = 9999;
0085         continue;
0086       }
0087       atomicAdd_block(&chi2[iv[i]], c2);
0088       atomicAdd_block(&nn[iv[i]], 1);
0089     }
0090     __syncthreads();
0091     for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x)
0092       if (nn[i] > 0)
0093         wv[i] *= float(nn[i]) / chi2[i];
0094
0095     if (verbose && 0 == threadIdx.x)
0096       printf("found %d proto clusters ", foundClusters);
0097     if (verbose && 0 == threadIdx.x)
0098       printf("and %d noise\n", noise);
0099   }
0100
0101   __global__ void fitVerticesKernel(VtxSoAView pdata,
0102                                     WsSoAView pws,
0103                                     float chi2Max  // for outlier rejection
0104   ) {
0105     fitVertices(pdata, pws, chi2Max);
0106   }
0107
0108 }  // namespace gpuVertexFinder
0109
0110 #endif  // RecoVertex_PixelVertexFinding_plugins_gpuFitVertices_h