Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2023-07-17 02:54:08

0001 #include <algorithm>
0002 #include <cassert>
0003 #include <cmath>
0004 #include <cstdint>
0005 #include <iomanip>
0006 #include <iostream>
0007 #include <memory>
0008 #include <numeric>
0009 #include <set>
0010 #include <vector>
0011 
0012 #ifdef __CUDACC__
0013 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0014 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0015 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0017 #endif  // __CUDACC__
0018 
0019 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
0020 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h"
0021 #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClusterThresholds.h"
0022 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
0023 
0024 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
0025 
0026 int main(void) {
0027 #ifdef __CUDACC__
0028   cms::cudatest::requireDevices();
0029 #endif  // __CUDACC__
0030 
0031   using namespace gpuClustering;
0032   using pixelTopology::Phase1;
0033 
0034   constexpr int numElements = 256 * maxNumModules;
0035   const SiPixelClusterThresholds clusterThresholds(
0036       clusterThresholdLayerOne, clusterThresholdOtherLayers, 0.f, 0.f, 0.f, 0.f);
0037 
0038   // these in reality are already on GPU
0039   auto h_raw = std::make_unique<uint32_t[]>(numElements);
0040   auto h_id = std::make_unique<uint16_t[]>(numElements);
0041   auto h_x = std::make_unique<uint16_t[]>(numElements);
0042   auto h_y = std::make_unique<uint16_t[]>(numElements);
0043   auto h_adc = std::make_unique<uint16_t[]>(numElements);
0044   auto h_clus = std::make_unique<int[]>(numElements);
0045 
0046 #ifdef __CUDACC__
0047   auto d_raw = cms::cuda::make_device_unique<uint32_t[]>(numElements, nullptr);
0048   auto d_id = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0049   auto d_x = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0050   auto d_y = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0051   auto d_adc = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0052   auto d_clus = cms::cuda::make_device_unique<int[]>(numElements, nullptr);
0053   auto d_moduleStart = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules + 1, nullptr);
0054   auto d_clusInModule = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
0055   auto d_moduleId = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
0056 #else   // __CUDACC__
0057   auto h_moduleStart = std::make_unique<uint32_t[]>(maxNumModules + 1);
0058   auto h_clusInModule = std::make_unique<uint32_t[]>(maxNumModules);
0059   auto h_moduleId = std::make_unique<uint32_t[]>(maxNumModules);
0060 #endif  // __CUDACC__
0061 
0062   // later random number
0063   int n = 0;
0064   int ncl = 0;
0065   int y[10] = {5, 7, 9, 1, 3, 0, 4, 8, 2, 6};
0066 
0067   auto generateClusters = [&](int kn) {
0068     auto addBigNoise = 1 == kn % 2;
0069     if (addBigNoise) {
0070       constexpr int MaxPixels = 1000;
0071       int id = 666;
0072       for (int x = 0; x < 140; x += 3) {
0073         for (int yy = 0; yy < 400; yy += 3) {
0074           h_id[n] = id;
0075           h_x[n] = x;
0076           h_y[n] = yy;
0077           h_adc[n] = 1000;
0078           ++n;
0079           ++ncl;
0080           if (MaxPixels <= ncl)
0081             break;
0082         }
0083         if (MaxPixels <= ncl)
0084           break;
0085       }
0086     }
0087 
0088     {
0089       // isolated
0090       int id = 42;
0091       int x = 10;
0092       ++ncl;
0093       h_id[n] = id;
0094       h_x[n] = x;
0095       h_y[n] = x;
0096       h_adc[n] = kn == 0 ? 100 : 5000;
0097       ++n;
0098 
0099       // first column
0100       ++ncl;
0101       h_id[n] = id;
0102       h_x[n] = x;
0103       h_y[n] = 0;
0104       h_adc[n] = 5000;
0105       ++n;
0106       // first columns
0107       ++ncl;
0108       h_id[n] = id;
0109       h_x[n] = x + 80;
0110       h_y[n] = 2;
0111       h_adc[n] = 5000;
0112       ++n;
0113       h_id[n] = id;
0114       h_x[n] = x + 80;
0115       h_y[n] = 1;
0116       h_adc[n] = 5000;
0117       ++n;
0118 
0119       // last column
0120       ++ncl;
0121       h_id[n] = id;
0122       h_x[n] = x;
0123       h_y[n] = 415;
0124       h_adc[n] = 5000;
0125       ++n;
0126       // last columns
0127       ++ncl;
0128       h_id[n] = id;
0129       h_x[n] = x + 80;
0130       h_y[n] = 415;
0131       h_adc[n] = 2500;
0132       ++n;
0133       h_id[n] = id;
0134       h_x[n] = x + 80;
0135       h_y[n] = 414;
0136       h_adc[n] = 2500;
0137       ++n;
0138 
0139       // diagonal
0140       ++ncl;
0141       for (int x = 20; x < 25; ++x) {
0142         h_id[n] = id;
0143         h_x[n] = x;
0144         h_y[n] = x;
0145         h_adc[n] = 1000;
0146         ++n;
0147       }
0148       ++ncl;
0149       // reversed
0150       for (int x = 45; x > 40; --x) {
0151         h_id[n] = id;
0152         h_x[n] = x;
0153         h_y[n] = x;
0154         h_adc[n] = 1000;
0155         ++n;
0156       }
0157       ++ncl;
0158       h_id[n++] = invalidModuleId;  // error
0159       // messy
0160       int xx[5] = {21, 25, 23, 24, 22};
0161       for (int k = 0; k < 5; ++k) {
0162         h_id[n] = id;
0163         h_x[n] = xx[k];
0164         h_y[n] = 20 + xx[k];
0165         h_adc[n] = 1000;
0166         ++n;
0167       }
0168       // holes
0169       ++ncl;
0170       for (int k = 0; k < 5; ++k) {
0171         h_id[n] = id;
0172         h_x[n] = xx[k];
0173         h_y[n] = 100;
0174         h_adc[n] = kn == 2 ? 100 : 1000;
0175         ++n;
0176         if (xx[k] % 2 == 0) {
0177           h_id[n] = id;
0178           h_x[n] = xx[k];
0179           h_y[n] = 101;
0180           h_adc[n] = 1000;
0181           ++n;
0182         }
0183       }
0184     }
0185     {
0186       // id == 0 (make sure it works!
0187       int id = 0;
0188       int x = 10;
0189       ++ncl;
0190       h_id[n] = id;
0191       h_x[n] = x;
0192       h_y[n] = x;
0193       h_adc[n] = 5000;
0194       ++n;
0195     }
0196     // all odd id
0197     for (int id = 11; id <= 1800; id += 2) {
0198       if ((id / 20) % 2)
0199         h_id[n++] = invalidModuleId;  // error
0200       for (int x = 0; x < 40; x += 4) {
0201         ++ncl;
0202         if ((id / 10) % 2) {
0203           for (int k = 0; k < 10; ++k) {
0204             h_id[n] = id;
0205             h_x[n] = x;
0206             h_y[n] = x + y[k];
0207             h_adc[n] = 100;
0208             ++n;
0209             h_id[n] = id;
0210             h_x[n] = x + 1;
0211             h_y[n] = x + y[k] + 2;
0212             h_adc[n] = 1000;
0213             ++n;
0214           }
0215         } else {
0216           for (int k = 0; k < 10; ++k) {
0217             h_id[n] = id;
0218             h_x[n] = x;
0219             h_y[n] = x + y[9 - k];
0220             h_adc[n] = kn == 2 ? 10 : 1000;
0221             ++n;
0222             if (y[k] == 3)
0223               continue;  // hole
0224             if (id == 51) {
0225               h_id[n++] = invalidModuleId;
0226               h_id[n++] = invalidModuleId;
0227             }  // error
0228             h_id[n] = id;
0229             h_x[n] = x + 1;
0230             h_y[n] = x + y[k] + 2;
0231             h_adc[n] = kn == 2 ? 10 : 1000;
0232             ++n;
0233           }
0234         }
0235       }
0236     }
0237   };  // end lambda
0238   for (auto kkk = 0; kkk < 5; ++kkk) {
0239     n = 0;
0240     ncl = 0;
0241     generateClusters(kkk);
0242 
0243     std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl;
0244     assert(n <= numElements);
0245 
0246     uint32_t nModules = 0;
0247 #ifdef __CUDACC__
0248     size_t size32 = n * sizeof(unsigned int);
0249     size_t size16 = n * sizeof(unsigned short);
0250     // size_t size8 = n * sizeof(uint8_t);
0251 
0252     cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice));
0253     cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice));
0254     cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice));
0255     cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice));
0256     cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice));
0257 
0258     // Launch CUDA Kernels
0259     int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256);
0260     int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
0261     std::cout << "CUDA countModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
0262               << " threads\n";
0263 
0264     cms::cuda::launch(
0265         countModules<Phase1>, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n);
0266 
0267     blocksPerGrid = maxNumModules;  //nModules;
0268 
0269     std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
0270               << " threads\n";
0271     cudaCheck(cudaMemset(d_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t)));
0272 
0273     cms::cuda::launch(findClus<Phase1>,
0274                       {blocksPerGrid, threadsPerBlock},
0275                       d_raw.get(),
0276                       d_id.get(),
0277                       d_x.get(),
0278                       d_y.get(),
0279                       d_moduleStart.get(),
0280                       d_clusInModule.get(),
0281                       d_moduleId.get(),
0282                       d_clus.get(),
0283                       n);
0284     cudaDeviceSynchronize();
0285     cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
0286 
0287     uint32_t nclus[maxNumModules], moduleId[nModules];
0288     cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), maxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0289 
0290     std::cout << "before charge cut found " << std::accumulate(nclus, nclus + maxNumModules, 0) << " clusters"
0291               << std::endl;
0292     for (auto i = maxNumModules; i > 0; i--)
0293       if (nclus[i - 1] > 0) {
0294         std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0295         break;
0296       }
0297     if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0))
0298       std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
0299 
0300     cms::cuda::launch(clusterChargeCut<Phase1>,
0301                       {blocksPerGrid, threadsPerBlock},
0302                       clusterThresholds,
0303                       d_id.get(),
0304                       d_adc.get(),
0305                       d_moduleStart.get(),
0306                       d_clusInModule.get(),
0307                       d_moduleId.get(),
0308                       d_clus.get(),
0309                       n);
0310 
0311     cudaDeviceSynchronize();
0312 #else   // __CUDACC__
0313     h_moduleStart[0] = nModules;
0314     countModules<Phase1>(h_id.get(), h_moduleStart.get(), h_clus.get(), n);
0315     memset(h_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t));
0316 
0317     findClus<Phase1>(h_raw.get(),
0318                      h_id.get(),
0319                      h_x.get(),
0320                      h_y.get(),
0321                      h_moduleStart.get(),
0322                      h_clusInModule.get(),
0323                      h_moduleId.get(),
0324                      h_clus.get(),
0325                      n);
0326 
0327     nModules = h_moduleStart[0];
0328     auto nclus = h_clusInModule.get();
0329 
0330     std::cout << "before charge cut found " << std::accumulate(nclus, nclus + maxNumModules, 0) << " clusters"
0331               << std::endl;
0332     for (auto i = maxNumModules; i > 0; i--)
0333       if (nclus[i - 1] > 0) {
0334         std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0335         break;
0336       }
0337     if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0))
0338       std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
0339 
0340     clusterChargeCut<Phase1>(clusterThresholds,
0341                              h_id.get(),
0342                              h_adc.get(),
0343                              h_moduleStart.get(),
0344                              h_clusInModule.get(),
0345                              h_moduleId.get(),
0346                              h_clus.get(),
0347                              n);
0348 #endif  // __CUDACC__
0349 
0350     std::cout << "found " << nModules << " Modules active" << std::endl;
0351 
0352 #ifdef __CUDACC__
0353     cudaCheck(cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost));
0354     cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost));
0355     cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), maxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0356     cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0357 #endif  // __CUDACC__
0358 
0359     std::set<unsigned int> clids;
0360     for (int i = 0; i < n; ++i) {
0361       assert(h_id[i] != 666);  // only noise
0362       if (h_id[i] == invalidModuleId)
0363         continue;
0364       assert(h_clus[i] >= 0);
0365       assert(h_clus[i] < int(nclus[h_id[i]]));
0366       clids.insert(h_id[i] * 1000 + h_clus[i]);
0367       // clids.insert(h_clus[i]);
0368     }
0369 
0370     // verify no hole in numbering
0371     auto p = clids.begin();
0372     auto cmid = (*p) / 1000;
0373     assert(0 == (*p) % 1000);
0374     auto c = p;
0375     ++c;
0376     std::cout << "first clusters " << *p << ' ' << *c << ' ' << nclus[cmid] << ' ' << nclus[(*c) / 1000] << std::endl;
0377     std::cout << "last cluster " << *clids.rbegin() << ' ' << nclus[(*clids.rbegin()) / 1000] << std::endl;
0378     for (; c != clids.end(); ++c) {
0379       auto cc = *c;
0380       auto pp = *p;
0381       auto mid = cc / 1000;
0382       auto pnc = pp % 1000;
0383       auto nc = cc % 1000;
0384       if (mid != cmid) {
0385         assert(0 == cc % 1000);
0386         assert(nclus[cmid] - 1 == pp % 1000);
0387         // if (nclus[cmid]-1 != pp%1000) std::cout << "error size " << mid << ": "  << nclus[mid] << ' ' << pp << std::endl;
0388         cmid = mid;
0389         p = c;
0390         continue;
0391       }
0392       p = c;
0393       // assert(nc==pnc+1);
0394       if (nc != pnc + 1)
0395         std::cout << "error " << mid << ": " << nc << ' ' << pnc << std::endl;
0396     }
0397 
0398     std::cout << "found " << std::accumulate(nclus, nclus + maxNumModules, 0) << ' ' << clids.size() << " clusters"
0399               << std::endl;
0400     for (auto i = maxNumModules; i > 0; i--)
0401       if (nclus[i - 1] > 0) {
0402         std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0403         break;
0404       }
0405     // << " and " << seeds.size() << " seeds" << std::endl;
0406   }  /// end loop kkk
0407   return 0;
0408 }