Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-11-11 03:57:21

0001 #include <cmath>
0002 #include <cstdint>
0003 #include <iostream>
0004 #include <random>
0005 #include <vector>
0006 
0007 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0008 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0009 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
0010 #ifdef USE_DBSCAN
0011 #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h"
0012 #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN
0013 #elif USE_ITERATIVE
0014 #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h"
0015 #define CLUSTERIZE gpuVertexFinder::clusterTracksIterative
0016 #else
0017 #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h"
0018 #define CLUSTERIZE gpuVertexFinder::clusterTracksByDensityKernel
0019 #endif
0020 #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h"
0021 #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h"
0022 #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h"
0023 
0024 #ifdef ONE_KERNEL
0025 #ifdef __CUDACC__
0026 __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
0027                                       gpuVertexFinder::WorkSpace* pws,
0028                                       int minT,      // min number of neighbours to be "seed"
0029                                       float eps,     // max absolute distance to cluster
0030                                       float errmax,  // max error to be "seed"
0031                                       float chi2max  // max normalized distance to cluster,
0032 ) {
0033   clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
0034   __syncthreads();
0035   fitVertices(pdata, pws, 50.);
0036   __syncthreads();
0037   splitVertices(pdata, pws, 9.f);
0038   __syncthreads();
0039   fitVertices(pdata, pws, 5000.);
0040   __syncthreads();
0041   sortByPt2(pdata, pws);
0042 }
0043 #endif
0044 #endif
0045 
0046 struct Event {
0047   std::vector<float> zvert;
0048   std::vector<uint16_t> itrack;
0049   std::vector<float> ztrack;
0050   std::vector<float> eztrack;
0051   std::vector<float> pttrack;
0052   std::vector<uint16_t> ivert;
0053 };
0054 
0055 struct ClusterGenerator {
0056   explicit ClusterGenerator(float nvert, float ntrack)
0057       : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) {}
0058 
0059   void operator()(Event& ev) {
0060     int nclus = clusGen(reng);
0061     ev.zvert.resize(nclus);
0062     ev.itrack.resize(nclus);
0063     for (auto& z : ev.zvert) {
0064       z = 3.5f * gauss(reng);
0065     }
0066 
0067     ev.ztrack.clear();
0068     ev.eztrack.clear();
0069     ev.ivert.clear();
0070     ev.pttrack.clear();
0071     for (int iv = 0; iv < nclus; ++iv) {
0072       auto nt = trackGen(reng);
0073       ev.itrack[iv] = nt;
0074       for (int it = 0; it < nt; ++it) {
0075         auto err = errgen(reng);  // reality is not flat....
0076         ev.ztrack.push_back(ev.zvert[iv] + err * gauss(reng));
0077         ev.eztrack.push_back(err * err);
0078         ev.ivert.push_back(iv);
0079         ev.pttrack.push_back((iv == 5 ? 1.f : 0.5f) + ptGen(reng));
0080         ev.pttrack.back() *= ev.pttrack.back();
0081       }
0082     }
0083     // add noise
0084     auto nt = 2 * trackGen(reng);
0085     for (int it = 0; it < nt; ++it) {
0086       auto err = 0.03f;
0087       ev.ztrack.push_back(rgen(reng));
0088       ev.eztrack.push_back(err * err);
0089       ev.ivert.push_back(9999);
0090       ev.pttrack.push_back(0.5f + ptGen(reng));
0091       ev.pttrack.back() *= ev.pttrack.back();
0092     }
0093   }
0094 
0095   std::mt19937 reng;
0096   std::uniform_real_distribution<float> rgen;
0097   std::uniform_real_distribution<float> errgen;
0098   std::poisson_distribution<int> clusGen;
0099   std::poisson_distribution<int> trackGen;
0100   std::normal_distribution<float> gauss;
0101   std::exponential_distribution<float> ptGen;
0102 };
0103 
0104 // a macro SORRY
0105 #define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M))
0106 #define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M))
0107 
0108 __global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) {
0109   auto const& __restrict__ data = *pdata;
0110   auto const& __restrict__ ws = *pws;
0111   printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate);
0112 }
0113 
0114 int main() {
0115 #ifdef __CUDACC__
0116   cms::cudatest::requireDevices();
0117 
0118   auto onGPU_d = cms::cuda::make_device_unique<gpuVertexFinder::ZVertices[]>(1, nullptr);
0119   auto ws_d = cms::cuda::make_device_unique<gpuVertexFinder::WorkSpace[]>(1, nullptr);
0120 #else
0121   auto onGPU_d = std::make_unique<gpuVertexFinder::ZVertices>();
0122   auto ws_d = std::make_unique<gpuVertexFinder::WorkSpace>();
0123 #endif
0124 
0125   Event ev;
0126 
0127   float eps = 0.1f;
0128   std::array<float, 3> par{{eps, 0.01f, 9.0f}};
0129   for (int nav = 30; nav < 80; nav += 20) {
0130     ClusterGenerator gen(nav, 10);
0131 
0132     for (int i = 8; i < 20; ++i) {
0133       auto kk = i / 4;  // M param
0134 
0135       gen(ev);
0136 
0137 #ifdef __CUDACC__
0138       init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
0139 #else
0140       onGPU_d->init();
0141       ws_d->init();
0142 #endif
0143 
0144       std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
0145       auto nt = ev.ztrack.size();
0146 #ifdef __CUDACC__
0147       cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
0148       cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
0149       cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
0150       cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
0151 #else
0152       ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
0153       ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
0154       ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
0155       ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
0156 #endif
0157 
0158       std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl;
0159 
0160       if ((i % 4) == 0)
0161         par = {{eps, 0.02f, 12.0f}};
0162       if ((i % 4) == 1)
0163         par = {{eps, 0.02f, 9.0f}};
0164       if ((i % 4) == 2)
0165         par = {{eps, 0.01f, 9.0f}};
0166       if ((i % 4) == 3)
0167         par = {{0.7f * eps, 0.01f, 9.0f}};
0168 
0169       uint32_t nv = 0;
0170 #ifdef __CUDACC__
0171       print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
0172       cudaCheck(cudaGetLastError());
0173       cudaDeviceSynchronize();
0174 
0175 #ifdef ONE_KERNEL
0176       cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
0177 #else
0178       cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
0179 #endif
0180       print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
0181 
0182       cudaCheck(cudaGetLastError());
0183       cudaDeviceSynchronize();
0184 
0185       cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
0186       cudaCheck(cudaGetLastError());
0187       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
0188 
0189 #else
0190       print(onGPU_d.get(), ws_d.get());
0191       CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
0192       print(onGPU_d.get(), ws_d.get());
0193       fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
0194       nv = onGPU_d->nvFinal;
0195 #endif
0196 
0197       if (nv == 0) {
0198         std::cout << "NO VERTICES???" << std::endl;
0199         continue;
0200       }
0201 
0202       float* zv = nullptr;
0203       float* wv = nullptr;
0204       float* ptv2 = nullptr;
0205       int32_t* nn = nullptr;
0206       uint16_t* ind = nullptr;
0207 
0208       // keep chi2 separated...
0209       float chi2[2 * nv];  // make space for splitting...
0210 
0211 #ifdef __CUDACC__
0212       float hzv[2 * nv];
0213       float hwv[2 * nv];
0214       float hptv2[2 * nv];
0215       int32_t hnn[2 * nv];
0216       uint16_t hind[2 * nv];
0217 
0218       zv = hzv;
0219       wv = hwv;
0220       ptv2 = hptv2;
0221       nn = hnn;
0222       ind = hind;
0223 #else
0224       zv = onGPU_d->zv;
0225       wv = onGPU_d->wv;
0226       ptv2 = onGPU_d->ptv2;
0227       nn = onGPU_d->ndof;
0228       ind = onGPU_d->sortInd;
0229 #endif
0230 
0231 #ifdef __CUDACC__
0232       cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
0233       cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
0234 #else
0235       memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
0236 #endif
0237 
0238       for (auto j = 0U; j < nv; ++j)
0239         if (nn[j] > 0)
0240           chi2[j] /= float(nn[j]);
0241       {
0242         auto mx = std::minmax_element(chi2, chi2 + nv);
0243         std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
0244       }
0245 
0246 #ifdef __CUDACC__
0247       cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
0248       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
0249       cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
0250       cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
0251 #else
0252       fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
0253       nv = onGPU_d->nvFinal;
0254       memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
0255 #endif
0256 
0257       for (auto j = 0U; j < nv; ++j)
0258         if (nn[j] > 0)
0259           chi2[j] /= float(nn[j]);
0260       {
0261         auto mx = std::minmax_element(chi2, chi2 + nv);
0262         std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
0263       }
0264 
0265 #ifdef __CUDACC__
0266       // one vertex per block!!!
0267       cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
0268       cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
0269 #else
0270       splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
0271       nv = ws_d->nvIntermediate;
0272 #endif
0273       std::cout << "after split " << nv << std::endl;
0274 
0275 #ifdef __CUDACC__
0276       cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
0277       cudaCheck(cudaGetLastError());
0278 
0279       cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
0280       cudaCheck(cudaGetLastError());
0281       cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
0282 #else
0283       fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
0284       sortByPt2(onGPU_d.get(), ws_d.get());
0285       nv = onGPU_d->nvFinal;
0286       memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
0287 #endif
0288 
0289       if (nv == 0) {
0290         std::cout << "NO VERTICES???" << std::endl;
0291         continue;
0292       }
0293 
0294 #ifdef __CUDACC__
0295       cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
0296       cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
0297       cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
0298       cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
0299       cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
0300       cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
0301 #endif
0302       for (auto j = 0U; j < nv; ++j)
0303         if (nn[j] > 0)
0304           chi2[j] /= float(nn[j]);
0305       {
0306         auto mx = std::minmax_element(chi2, chi2 + nv);
0307         std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
0308       }
0309 
0310       {
0311         auto mx = std::minmax_element(wv, wv + nv);
0312         std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second) << std::endl;
0313       }
0314 
0315       {
0316         auto mx = std::minmax_element(ptv2, ptv2 + nv);
0317         std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl;
0318         std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv - 1]] << " at " << ind[0] << ' '
0319                   << ind[nv - 1] << std::endl;
0320       }
0321 
0322       float dd[nv];
0323       for (auto kv = 0U; kv < nv; ++kv) {
0324         auto zr = zv[kv];
0325         auto md = 500.0f;
0326         for (auto zs : ev.ztrack) {
0327           auto d = std::abs(zr - zs);
0328           md = std::min(d, md);
0329         }
0330         dd[kv] = md;
0331       }
0332       if (i == 6) {
0333         for (auto d : dd)
0334           std::cout << d << ' ';
0335         std::cout << std::endl;
0336       }
0337       auto mx = std::minmax_element(dd, dd + nv);
0338       float rms = 0;
0339       for (auto d : dd)
0340         rms += d * d;
0341       rms = std::sqrt(rms) / (nv - 1);
0342       std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
0343 
0344     }  // loop on events
0345   }    // lopp on ave vert
0346 
0347   return 0;
0348 }