Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:26:20

0001 #include <algorithm>
0002 #include <cassert>
0003 #include <cmath>
0004 #include <cstdint>
0005 #include <iomanip>
0006 #include <iostream>
0007 #include <memory>
0008 #include <numeric>
0009 #include <set>
0010 #include <vector>
0011 
0012 #ifdef __CUDACC__
0013 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0014 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0015 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0017 #endif  // __CUDACC__
0018 
0019 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
0020 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
0021 #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelClusterThresholds.h"
0022 
0023 // local includes, for testing only
0024 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h"
0025 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
0026 
0027 int main(void) {
0028 #ifdef __CUDACC__
0029   cms::cudatest::requireDevices();
0030 #endif  // __CUDACC__
0031 
0032   using namespace gpuClustering;
0033   using pixelTopology::Phase1;
0034 
0035   constexpr int numElements = 256 * maxNumModules;
0036   const SiPixelClusterThresholds clusterThresholds(
0037       clusterThresholdLayerOne, clusterThresholdOtherLayers, 0.f, 0.f, 0.f, 0.f);
0038 
0039   // these in reality are already on GPU
0040   auto h_raw = std::make_unique<uint32_t[]>(numElements);
0041   auto h_id = std::make_unique<uint16_t[]>(numElements);
0042   auto h_x = std::make_unique<uint16_t[]>(numElements);
0043   auto h_y = std::make_unique<uint16_t[]>(numElements);
0044   auto h_adc = std::make_unique<uint16_t[]>(numElements);
0045   auto h_clus = std::make_unique<int[]>(numElements);
0046 
0047 #ifdef __CUDACC__
0048   auto d_raw = cms::cuda::make_device_unique<uint32_t[]>(numElements, nullptr);
0049   auto d_id = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0050   auto d_x = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0051   auto d_y = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0052   auto d_adc = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0053   auto d_clus = cms::cuda::make_device_unique<int[]>(numElements, nullptr);
0054   auto d_moduleStart = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules + 1, nullptr);
0055   auto d_clusInModule = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
0056   auto d_moduleId = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
0057 #else   // __CUDACC__
0058   auto h_moduleStart = std::make_unique<uint32_t[]>(maxNumModules + 1);
0059   auto h_clusInModule = std::make_unique<uint32_t[]>(maxNumModules);
0060   auto h_moduleId = std::make_unique<uint32_t[]>(maxNumModules);
0061 #endif  // __CUDACC__
0062 
0063   // later random number
0064   int n = 0;
0065   int ncl = 0;
0066   int y[10] = {5, 7, 9, 1, 3, 0, 4, 8, 2, 6};
0067 
0068   auto generateClusters = [&](int kn) {
0069     auto addBigNoise = 1 == kn % 2;
0070     if (addBigNoise) {
0071       constexpr int MaxPixels = 1000;
0072       int id = 666;
0073       for (int x = 0; x < 140; x += 3) {
0074         for (int yy = 0; yy < 400; yy += 3) {
0075           h_id[n] = id;
0076           h_x[n] = x;
0077           h_y[n] = yy;
0078           h_adc[n] = 1000;
0079           ++n;
0080           ++ncl;
0081           if (MaxPixels <= ncl)
0082             break;
0083         }
0084         if (MaxPixels <= ncl)
0085           break;
0086       }
0087     }
0088 
0089     {
0090       // isolated
0091       int id = 42;
0092       int x = 10;
0093       ++ncl;
0094       h_id[n] = id;
0095       h_x[n] = x;
0096       h_y[n] = x;
0097       h_adc[n] = kn == 0 ? 100 : 5000;
0098       ++n;
0099 
0100       // first column
0101       ++ncl;
0102       h_id[n] = id;
0103       h_x[n] = x;
0104       h_y[n] = 0;
0105       h_adc[n] = 5000;
0106       ++n;
0107       // first columns
0108       ++ncl;
0109       h_id[n] = id;
0110       h_x[n] = x + 80;
0111       h_y[n] = 2;
0112       h_adc[n] = 5000;
0113       ++n;
0114       h_id[n] = id;
0115       h_x[n] = x + 80;
0116       h_y[n] = 1;
0117       h_adc[n] = 5000;
0118       ++n;
0119 
0120       // last column
0121       ++ncl;
0122       h_id[n] = id;
0123       h_x[n] = x;
0124       h_y[n] = 415;
0125       h_adc[n] = 5000;
0126       ++n;
0127       // last columns
0128       ++ncl;
0129       h_id[n] = id;
0130       h_x[n] = x + 80;
0131       h_y[n] = 415;
0132       h_adc[n] = 2500;
0133       ++n;
0134       h_id[n] = id;
0135       h_x[n] = x + 80;
0136       h_y[n] = 414;
0137       h_adc[n] = 2500;
0138       ++n;
0139 
0140       // diagonal
0141       ++ncl;
0142       for (int x = 20; x < 25; ++x) {
0143         h_id[n] = id;
0144         h_x[n] = x;
0145         h_y[n] = x;
0146         h_adc[n] = 1000;
0147         ++n;
0148       }
0149       ++ncl;
0150       // reversed
0151       for (int x = 45; x > 40; --x) {
0152         h_id[n] = id;
0153         h_x[n] = x;
0154         h_y[n] = x;
0155         h_adc[n] = 1000;
0156         ++n;
0157       }
0158       ++ncl;
0159       h_id[n++] = invalidModuleId;  // error
0160       // messy
0161       int xx[5] = {21, 25, 23, 24, 22};
0162       for (int k = 0; k < 5; ++k) {
0163         h_id[n] = id;
0164         h_x[n] = xx[k];
0165         h_y[n] = 20 + xx[k];
0166         h_adc[n] = 1000;
0167         ++n;
0168       }
0169       // holes
0170       ++ncl;
0171       for (int k = 0; k < 5; ++k) {
0172         h_id[n] = id;
0173         h_x[n] = xx[k];
0174         h_y[n] = 100;
0175         h_adc[n] = kn == 2 ? 100 : 1000;
0176         ++n;
0177         if (xx[k] % 2 == 0) {
0178           h_id[n] = id;
0179           h_x[n] = xx[k];
0180           h_y[n] = 101;
0181           h_adc[n] = 1000;
0182           ++n;
0183         }
0184       }
0185     }
0186     {
0187       // id == 0 (make sure it works!
0188       int id = 0;
0189       int x = 10;
0190       ++ncl;
0191       h_id[n] = id;
0192       h_x[n] = x;
0193       h_y[n] = x;
0194       h_adc[n] = 5000;
0195       ++n;
0196     }
0197     // all odd id
0198     for (int id = 11; id <= 1800; id += 2) {
0199       if ((id / 20) % 2)
0200         h_id[n++] = invalidModuleId;  // error
0201       for (int x = 0; x < 40; x += 4) {
0202         ++ncl;
0203         if ((id / 10) % 2) {
0204           for (int k = 0; k < 10; ++k) {
0205             h_id[n] = id;
0206             h_x[n] = x;
0207             h_y[n] = x + y[k];
0208             h_adc[n] = 100;
0209             ++n;
0210             h_id[n] = id;
0211             h_x[n] = x + 1;
0212             h_y[n] = x + y[k] + 2;
0213             h_adc[n] = 1000;
0214             ++n;
0215           }
0216         } else {
0217           for (int k = 0; k < 10; ++k) {
0218             h_id[n] = id;
0219             h_x[n] = x;
0220             h_y[n] = x + y[9 - k];
0221             h_adc[n] = kn == 2 ? 10 : 1000;
0222             ++n;
0223             if (y[k] == 3)
0224               continue;  // hole
0225             if (id == 51) {
0226               h_id[n++] = invalidModuleId;
0227               h_id[n++] = invalidModuleId;
0228             }  // error
0229             h_id[n] = id;
0230             h_x[n] = x + 1;
0231             h_y[n] = x + y[k] + 2;
0232             h_adc[n] = kn == 2 ? 10 : 1000;
0233             ++n;
0234           }
0235         }
0236       }
0237     }
0238   };  // end lambda
0239   for (auto kkk = 0; kkk < 5; ++kkk) {
0240     n = 0;
0241     ncl = 0;
0242     generateClusters(kkk);
0243 
0244     std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl;
0245     assert(n <= numElements);
0246 
0247     uint32_t nModules = 0;
0248 #ifdef __CUDACC__
0249     size_t size32 = n * sizeof(unsigned int);
0250     size_t size16 = n * sizeof(unsigned short);
0251     // size_t size8 = n * sizeof(uint8_t);
0252 
0253     cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice));
0254     cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice));
0255     cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice));
0256     cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice));
0257     cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice));
0258 
0259     // Launch CUDA Kernels
0260     int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256);
0261     int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
0262     std::cout << "CUDA countModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
0263               << " threads\n";
0264 
0265     cms::cuda::launch(
0266         countModules<Phase1>, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n);
0267 
0268     blocksPerGrid = maxNumModules;  //nModules;
0269 
0270     std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
0271               << " threads\n";
0272     cudaCheck(cudaMemset(d_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t)));
0273 
0274     cms::cuda::launch(findClus<Phase1>,
0275                       {blocksPerGrid, threadsPerBlock},
0276                       d_raw.get(),
0277                       d_id.get(),
0278                       d_x.get(),
0279                       d_y.get(),
0280                       d_moduleStart.get(),
0281                       d_clusInModule.get(),
0282                       d_moduleId.get(),
0283                       d_clus.get(),
0284                       n);
0285     cudaDeviceSynchronize();
0286     cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
0287 
0288     uint32_t nclus[maxNumModules], moduleId[nModules];
0289     cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), maxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0290 
0291     std::cout << "before charge cut found " << std::accumulate(nclus, nclus + maxNumModules, 0) << " clusters"
0292               << std::endl;
0293     for (auto i = maxNumModules; i > 0; i--)
0294       if (nclus[i - 1] > 0) {
0295         std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0296         break;
0297       }
0298     if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0))
0299       std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
0300 
0301     cms::cuda::launch(clusterChargeCut<Phase1>,
0302                       {blocksPerGrid, threadsPerBlock},
0303                       clusterThresholds,
0304                       d_id.get(),
0305                       d_adc.get(),
0306                       d_moduleStart.get(),
0307                       d_clusInModule.get(),
0308                       d_moduleId.get(),
0309                       d_clus.get(),
0310                       n);
0311 
0312     cudaDeviceSynchronize();
0313 #else   // __CUDACC__
0314     h_moduleStart[0] = nModules;
0315     countModules<Phase1>(h_id.get(), h_moduleStart.get(), h_clus.get(), n);
0316     memset(h_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t));
0317 
0318     findClus<Phase1>(h_raw.get(),
0319                      h_id.get(),
0320                      h_x.get(),
0321                      h_y.get(),
0322                      h_moduleStart.get(),
0323                      h_clusInModule.get(),
0324                      h_moduleId.get(),
0325                      h_clus.get(),
0326                      n);
0327 
0328     nModules = h_moduleStart[0];
0329     auto nclus = h_clusInModule.get();
0330 
0331     std::cout << "before charge cut found " << std::accumulate(nclus, nclus + maxNumModules, 0) << " clusters"
0332               << std::endl;
0333     for (auto i = maxNumModules; i > 0; i--)
0334       if (nclus[i - 1] > 0) {
0335         std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0336         break;
0337       }
0338     if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0))
0339       std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
0340 
0341     clusterChargeCut<Phase1>(clusterThresholds,
0342                              h_id.get(),
0343                              h_adc.get(),
0344                              h_moduleStart.get(),
0345                              h_clusInModule.get(),
0346                              h_moduleId.get(),
0347                              h_clus.get(),
0348                              n);
0349 #endif  // __CUDACC__
0350 
0351     std::cout << "found " << nModules << " Modules active" << std::endl;
0352 
0353 #ifdef __CUDACC__
0354     cudaCheck(cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost));
0355     cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost));
0356     cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), maxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0357     cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0358 #endif  // __CUDACC__
0359 
0360     std::set<unsigned int> clids;
0361     for (int i = 0; i < n; ++i) {
0362       assert(h_id[i] != 666);  // only noise
0363       if (h_id[i] == invalidModuleId)
0364         continue;
0365       assert(h_clus[i] >= 0);
0366       assert(h_clus[i] < int(nclus[h_id[i]]));
0367       clids.insert(h_id[i] * 1000 + h_clus[i]);
0368       // clids.insert(h_clus[i]);
0369     }
0370 
0371     // verify no hole in numbering
0372     auto p = clids.begin();
0373     auto cmid = (*p) / 1000;
0374     assert(0 == (*p) % 1000);
0375     auto c = p;
0376     ++c;
0377     std::cout << "first clusters " << *p << ' ' << *c << ' ' << nclus[cmid] << ' ' << nclus[(*c) / 1000] << std::endl;
0378     std::cout << "last cluster " << *clids.rbegin() << ' ' << nclus[(*clids.rbegin()) / 1000] << std::endl;
0379     for (; c != clids.end(); ++c) {
0380       auto cc = *c;
0381       auto pp = *p;
0382       auto mid = cc / 1000;
0383       auto pnc = pp % 1000;
0384       auto nc = cc % 1000;
0385       if (mid != cmid) {
0386         assert(0 == cc % 1000);
0387         assert(nclus[cmid] - 1 == pp % 1000);
0388         // if (nclus[cmid]-1 != pp%1000) std::cout << "error size " << mid << ": "  << nclus[mid] << ' ' << pp << std::endl;
0389         cmid = mid;
0390         p = c;
0391         continue;
0392       }
0393       p = c;
0394       // assert(nc==pnc+1);
0395       if (nc != pnc + 1)
0396         std::cout << "error " << mid << ": " << nc << ' ' << pnc << std::endl;
0397     }
0398 
0399     std::cout << "found " << std::accumulate(nclus, nclus + maxNumModules, 0) << ' ' << clids.size() << " clusters"
0400               << std::endl;
0401     for (auto i = maxNumModules; i > 0; i--)
0402       if (nclus[i - 1] > 0) {
0403         std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0404         break;
0405       }
0406     // << " and " << seeds.size() << " seeds" << std::endl;
0407   }  /// end loop kkk
0408   return 0;
0409 }