Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2022-08-04 22:45:25

0001 #include <algorithm>
0002 #include <cassert>
0003 #include <cmath>
0004 #include <cstdint>
0005 #include <iomanip>
0006 #include <iostream>
0007 #include <memory>
0008 #include <numeric>
0009 #include <set>
0010 #include <vector>
0011 
0012 #ifdef __CUDACC__
0013 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0014 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0015 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0017 #endif  // __CUDACC__
0018 
0019 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
0020 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h"
0021 #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClusterThresholds.h"
0022 
0023 int main(void) {
0024 #ifdef __CUDACC__
0025   cms::cudatest::requireDevices();
0026 #endif  // __CUDACC__
0027 
0028   using namespace gpuClustering;
0029 
0030   constexpr int numElements = 256 * maxNumModules;
0031   constexpr SiPixelClusterThresholds clusterThresholds(kSiPixelClusterThresholdsDefaultPhase1);
0032 
0033   // these in reality are already on GPU
0034   auto h_raw = std::make_unique<uint32_t[]>(numElements);
0035   auto h_id = std::make_unique<uint16_t[]>(numElements);
0036   auto h_x = std::make_unique<uint16_t[]>(numElements);
0037   auto h_y = std::make_unique<uint16_t[]>(numElements);
0038   auto h_adc = std::make_unique<uint16_t[]>(numElements);
0039   auto h_clus = std::make_unique<int[]>(numElements);
0040 
0041 #ifdef __CUDACC__
0042   auto d_raw = cms::cuda::make_device_unique<uint32_t[]>(numElements, nullptr);
0043   auto d_id = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0044   auto d_x = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0045   auto d_y = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0046   auto d_adc = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0047   auto d_clus = cms::cuda::make_device_unique<int[]>(numElements, nullptr);
0048   auto d_moduleStart = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules + 1, nullptr);
0049   auto d_clusInModule = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
0050   auto d_moduleId = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
0051 #else   // __CUDACC__
0052   auto h_moduleStart = std::make_unique<uint32_t[]>(maxNumModules + 1);
0053   auto h_clusInModule = std::make_unique<uint32_t[]>(maxNumModules);
0054   auto h_moduleId = std::make_unique<uint32_t[]>(maxNumModules);
0055 #endif  // __CUDACC__
0056 
0057   // later random number
0058   int n = 0;
0059   int ncl = 0;
0060   int y[10] = {5, 7, 9, 1, 3, 0, 4, 8, 2, 6};
0061 
0062   auto generateClusters = [&](int kn) {
0063     auto addBigNoise = 1 == kn % 2;
0064     if (addBigNoise) {
0065       constexpr int MaxPixels = 1000;
0066       int id = 666;
0067       for (int x = 0; x < 140; x += 3) {
0068         for (int yy = 0; yy < 400; yy += 3) {
0069           h_id[n] = id;
0070           h_x[n] = x;
0071           h_y[n] = yy;
0072           h_adc[n] = 1000;
0073           ++n;
0074           ++ncl;
0075           if (MaxPixels <= ncl)
0076             break;
0077         }
0078         if (MaxPixels <= ncl)
0079           break;
0080       }
0081     }
0082 
0083     {
0084       // isolated
0085       int id = 42;
0086       int x = 10;
0087       ++ncl;
0088       h_id[n] = id;
0089       h_x[n] = x;
0090       h_y[n] = x;
0091       h_adc[n] = kn == 0 ? 100 : 5000;
0092       ++n;
0093 
0094       // first column
0095       ++ncl;
0096       h_id[n] = id;
0097       h_x[n] = x;
0098       h_y[n] = 0;
0099       h_adc[n] = 5000;
0100       ++n;
0101       // first columns
0102       ++ncl;
0103       h_id[n] = id;
0104       h_x[n] = x + 80;
0105       h_y[n] = 2;
0106       h_adc[n] = 5000;
0107       ++n;
0108       h_id[n] = id;
0109       h_x[n] = x + 80;
0110       h_y[n] = 1;
0111       h_adc[n] = 5000;
0112       ++n;
0113 
0114       // last column
0115       ++ncl;
0116       h_id[n] = id;
0117       h_x[n] = x;
0118       h_y[n] = 415;
0119       h_adc[n] = 5000;
0120       ++n;
0121       // last columns
0122       ++ncl;
0123       h_id[n] = id;
0124       h_x[n] = x + 80;
0125       h_y[n] = 415;
0126       h_adc[n] = 2500;
0127       ++n;
0128       h_id[n] = id;
0129       h_x[n] = x + 80;
0130       h_y[n] = 414;
0131       h_adc[n] = 2500;
0132       ++n;
0133 
0134       // diagonal
0135       ++ncl;
0136       for (int x = 20; x < 25; ++x) {
0137         h_id[n] = id;
0138         h_x[n] = x;
0139         h_y[n] = x;
0140         h_adc[n] = 1000;
0141         ++n;
0142       }
0143       ++ncl;
0144       // reversed
0145       for (int x = 45; x > 40; --x) {
0146         h_id[n] = id;
0147         h_x[n] = x;
0148         h_y[n] = x;
0149         h_adc[n] = 1000;
0150         ++n;
0151       }
0152       ++ncl;
0153       h_id[n++] = invalidModuleId;  // error
0154       // messy
0155       int xx[5] = {21, 25, 23, 24, 22};
0156       for (int k = 0; k < 5; ++k) {
0157         h_id[n] = id;
0158         h_x[n] = xx[k];
0159         h_y[n] = 20 + xx[k];
0160         h_adc[n] = 1000;
0161         ++n;
0162       }
0163       // holes
0164       ++ncl;
0165       for (int k = 0; k < 5; ++k) {
0166         h_id[n] = id;
0167         h_x[n] = xx[k];
0168         h_y[n] = 100;
0169         h_adc[n] = kn == 2 ? 100 : 1000;
0170         ++n;
0171         if (xx[k] % 2 == 0) {
0172           h_id[n] = id;
0173           h_x[n] = xx[k];
0174           h_y[n] = 101;
0175           h_adc[n] = 1000;
0176           ++n;
0177         }
0178       }
0179     }
0180     {
0181       // id == 0 (make sure it works!
0182       int id = 0;
0183       int x = 10;
0184       ++ncl;
0185       h_id[n] = id;
0186       h_x[n] = x;
0187       h_y[n] = x;
0188       h_adc[n] = 5000;
0189       ++n;
0190     }
0191     // all odd id
0192     for (int id = 11; id <= 1800; id += 2) {
0193       if ((id / 20) % 2)
0194         h_id[n++] = invalidModuleId;  // error
0195       for (int x = 0; x < 40; x += 4) {
0196         ++ncl;
0197         if ((id / 10) % 2) {
0198           for (int k = 0; k < 10; ++k) {
0199             h_id[n] = id;
0200             h_x[n] = x;
0201             h_y[n] = x + y[k];
0202             h_adc[n] = 100;
0203             ++n;
0204             h_id[n] = id;
0205             h_x[n] = x + 1;
0206             h_y[n] = x + y[k] + 2;
0207             h_adc[n] = 1000;
0208             ++n;
0209           }
0210         } else {
0211           for (int k = 0; k < 10; ++k) {
0212             h_id[n] = id;
0213             h_x[n] = x;
0214             h_y[n] = x + y[9 - k];
0215             h_adc[n] = kn == 2 ? 10 : 1000;
0216             ++n;
0217             if (y[k] == 3)
0218               continue;  // hole
0219             if (id == 51) {
0220               h_id[n++] = invalidModuleId;
0221               h_id[n++] = invalidModuleId;
0222             }  // error
0223             h_id[n] = id;
0224             h_x[n] = x + 1;
0225             h_y[n] = x + y[k] + 2;
0226             h_adc[n] = kn == 2 ? 10 : 1000;
0227             ++n;
0228           }
0229         }
0230       }
0231     }
0232   };  // end lambda
0233   for (auto kkk = 0; kkk < 5; ++kkk) {
0234     n = 0;
0235     ncl = 0;
0236     generateClusters(kkk);
0237 
0238     std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl;
0239     assert(n <= numElements);
0240 
0241     uint32_t nModules = 0;
0242 #ifdef __CUDACC__
0243     size_t size32 = n * sizeof(unsigned int);
0244     size_t size16 = n * sizeof(unsigned short);
0245     // size_t size8 = n * sizeof(uint8_t);
0246 
0247     cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice));
0248     cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice));
0249     cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice));
0250     cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice));
0251     cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice));
0252 
0253     // Launch CUDA Kernels
0254     int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256);
0255     int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
0256     std::cout << "CUDA countModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
0257               << " threads\n";
0258 
0259     cms::cuda::launch(
0260         countModules<false>, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n);
0261 
0262     blocksPerGrid = maxNumModules;  //nModules;
0263 
0264     std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
0265               << " threads\n";
0266     cudaCheck(cudaMemset(d_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t)));
0267 
0268     cms::cuda::launch(findClus<false>,
0269                       {blocksPerGrid, threadsPerBlock},
0270                       d_raw.get(),
0271                       d_id.get(),
0272                       d_x.get(),
0273                       d_y.get(),
0274                       d_moduleStart.get(),
0275                       d_clusInModule.get(),
0276                       d_moduleId.get(),
0277                       d_clus.get(),
0278                       n);
0279     cudaDeviceSynchronize();
0280     cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
0281 
0282     uint32_t nclus[maxNumModules], moduleId[nModules];
0283     cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), maxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0284 
0285     std::cout << "before charge cut found " << std::accumulate(nclus, nclus + maxNumModules, 0) << " clusters"
0286               << std::endl;
0287     for (auto i = maxNumModules; i > 0; i--)
0288       if (nclus[i - 1] > 0) {
0289         std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0290         break;
0291       }
0292     if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0))
0293       std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
0294 
0295     cms::cuda::launch(clusterChargeCut<false>,
0296                       {blocksPerGrid, threadsPerBlock},
0297                       clusterThresholds,
0298                       d_id.get(),
0299                       d_adc.get(),
0300                       d_moduleStart.get(),
0301                       d_clusInModule.get(),
0302                       d_moduleId.get(),
0303                       d_clus.get(),
0304                       n);
0305 
0306     cudaDeviceSynchronize();
0307 #else   // __CUDACC__
0308     h_moduleStart[0] = nModules;
0309     countModules<false>(h_id.get(), h_moduleStart.get(), h_clus.get(), n);
0310     memset(h_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t));
0311     findClus<false>(h_raw.get(),
0312                     h_id.get(),
0313                     h_x.get(),
0314                     h_y.get(),
0315                     h_moduleStart.get(),
0316                     h_clusInModule.get(),
0317                     h_moduleId.get(),
0318                     h_clus.get(),
0319                     n);
0320 
0321     nModules = h_moduleStart[0];
0322     auto nclus = h_clusInModule.get();
0323 
0324     std::cout << "before charge cut found " << std::accumulate(nclus, nclus + maxNumModules, 0) << " clusters"
0325               << std::endl;
0326     for (auto i = maxNumModules; i > 0; i--)
0327       if (nclus[i - 1] > 0) {
0328         std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0329         break;
0330       }
0331     if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0))
0332       std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
0333 
0334     clusterChargeCut<false>(clusterThresholds,
0335                             h_id.get(),
0336                             h_adc.get(),
0337                             h_moduleStart.get(),
0338                             h_clusInModule.get(),
0339                             h_moduleId.get(),
0340                             h_clus.get(),
0341                             n);
0342 #endif  // __CUDACC__
0343 
0344     std::cout << "found " << nModules << " Modules active" << std::endl;
0345 
0346 #ifdef __CUDACC__
0347     cudaCheck(cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost));
0348     cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost));
0349     cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), maxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0350     cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0351 #endif  // __CUDACC__
0352 
0353     std::set<unsigned int> clids;
0354     for (int i = 0; i < n; ++i) {
0355       assert(h_id[i] != 666);  // only noise
0356       if (h_id[i] == invalidModuleId)
0357         continue;
0358       assert(h_clus[i] >= 0);
0359       assert(h_clus[i] < int(nclus[h_id[i]]));
0360       clids.insert(h_id[i] * 1000 + h_clus[i]);
0361       // clids.insert(h_clus[i]);
0362     }
0363 
0364     // verify no hole in numbering
0365     auto p = clids.begin();
0366     auto cmid = (*p) / 1000;
0367     assert(0 == (*p) % 1000);
0368     auto c = p;
0369     ++c;
0370     std::cout << "first clusters " << *p << ' ' << *c << ' ' << nclus[cmid] << ' ' << nclus[(*c) / 1000] << std::endl;
0371     std::cout << "last cluster " << *clids.rbegin() << ' ' << nclus[(*clids.rbegin()) / 1000] << std::endl;
0372     for (; c != clids.end(); ++c) {
0373       auto cc = *c;
0374       auto pp = *p;
0375       auto mid = cc / 1000;
0376       auto pnc = pp % 1000;
0377       auto nc = cc % 1000;
0378       if (mid != cmid) {
0379         assert(0 == cc % 1000);
0380         assert(nclus[cmid] - 1 == pp % 1000);
0381         // if (nclus[cmid]-1 != pp%1000) std::cout << "error size " << mid << ": "  << nclus[mid] << ' ' << pp << std::endl;
0382         cmid = mid;
0383         p = c;
0384         continue;
0385       }
0386       p = c;
0387       // assert(nc==pnc+1);
0388       if (nc != pnc + 1)
0389         std::cout << "error " << mid << ": " << nc << ' ' << pnc << std::endl;
0390     }
0391 
0392     std::cout << "found " << std::accumulate(nclus, nclus + maxNumModules, 0) << ' ' << clids.size() << " clusters"
0393               << std::endl;
0394     for (auto i = maxNumModules; i > 0; i--)
0395       if (nclus[i - 1] > 0) {
0396         std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0397         break;
0398       }
0399     // << " and " << seeds.size() << " seeds" << std::endl;
0400   }  /// end loop kkk
0401   return 0;
0402 }