File indexing completed on 2024-04-06 12:26:20
0001 #include <algorithm>
0002 #include <cassert>
0003 #include <cmath>
0004 #include <cstdint>
0005 #include <iomanip>
0006 #include <iostream>
0007 #include <memory>
0008 #include <numeric>
0009 #include <set>
0010 #include <vector>
0011
0012 #ifdef __CUDACC__
0013 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0014 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0015 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0017 #endif
0018
0019 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
0020 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
0021 #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelClusterThresholds.h"
0022
0023
0024 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h"
0025 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
0026
0027 int main(void) {
0028 #ifdef __CUDACC__
0029 cms::cudatest::requireDevices();
0030 #endif
0031
0032 using namespace gpuClustering;
0033 using pixelTopology::Phase1;
0034
0035 constexpr int numElements = 256 * maxNumModules;
0036 const SiPixelClusterThresholds clusterThresholds(
0037 clusterThresholdLayerOne, clusterThresholdOtherLayers, 0.f, 0.f, 0.f, 0.f);
0038
0039
0040 auto h_raw = std::make_unique<uint32_t[]>(numElements);
0041 auto h_id = std::make_unique<uint16_t[]>(numElements);
0042 auto h_x = std::make_unique<uint16_t[]>(numElements);
0043 auto h_y = std::make_unique<uint16_t[]>(numElements);
0044 auto h_adc = std::make_unique<uint16_t[]>(numElements);
0045 auto h_clus = std::make_unique<int[]>(numElements);
0046
0047 #ifdef __CUDACC__
0048 auto d_raw = cms::cuda::make_device_unique<uint32_t[]>(numElements, nullptr);
0049 auto d_id = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0050 auto d_x = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0051 auto d_y = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0052 auto d_adc = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0053 auto d_clus = cms::cuda::make_device_unique<int[]>(numElements, nullptr);
0054 auto d_moduleStart = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules + 1, nullptr);
0055 auto d_clusInModule = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
0056 auto d_moduleId = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
0057 #else
0058 auto h_moduleStart = std::make_unique<uint32_t[]>(maxNumModules + 1);
0059 auto h_clusInModule = std::make_unique<uint32_t[]>(maxNumModules);
0060 auto h_moduleId = std::make_unique<uint32_t[]>(maxNumModules);
0061 #endif
0062
0063
0064 int n = 0;
0065 int ncl = 0;
0066 int y[10] = {5, 7, 9, 1, 3, 0, 4, 8, 2, 6};
0067
0068 auto generateClusters = [&](int kn) {
0069 auto addBigNoise = 1 == kn % 2;
0070 if (addBigNoise) {
0071 constexpr int MaxPixels = 1000;
0072 int id = 666;
0073 for (int x = 0; x < 140; x += 3) {
0074 for (int yy = 0; yy < 400; yy += 3) {
0075 h_id[n] = id;
0076 h_x[n] = x;
0077 h_y[n] = yy;
0078 h_adc[n] = 1000;
0079 ++n;
0080 ++ncl;
0081 if (MaxPixels <= ncl)
0082 break;
0083 }
0084 if (MaxPixels <= ncl)
0085 break;
0086 }
0087 }
0088
0089 {
0090
0091 int id = 42;
0092 int x = 10;
0093 ++ncl;
0094 h_id[n] = id;
0095 h_x[n] = x;
0096 h_y[n] = x;
0097 h_adc[n] = kn == 0 ? 100 : 5000;
0098 ++n;
0099
0100
0101 ++ncl;
0102 h_id[n] = id;
0103 h_x[n] = x;
0104 h_y[n] = 0;
0105 h_adc[n] = 5000;
0106 ++n;
0107
0108 ++ncl;
0109 h_id[n] = id;
0110 h_x[n] = x + 80;
0111 h_y[n] = 2;
0112 h_adc[n] = 5000;
0113 ++n;
0114 h_id[n] = id;
0115 h_x[n] = x + 80;
0116 h_y[n] = 1;
0117 h_adc[n] = 5000;
0118 ++n;
0119
0120
0121 ++ncl;
0122 h_id[n] = id;
0123 h_x[n] = x;
0124 h_y[n] = 415;
0125 h_adc[n] = 5000;
0126 ++n;
0127
0128 ++ncl;
0129 h_id[n] = id;
0130 h_x[n] = x + 80;
0131 h_y[n] = 415;
0132 h_adc[n] = 2500;
0133 ++n;
0134 h_id[n] = id;
0135 h_x[n] = x + 80;
0136 h_y[n] = 414;
0137 h_adc[n] = 2500;
0138 ++n;
0139
0140
0141 ++ncl;
0142 for (int x = 20; x < 25; ++x) {
0143 h_id[n] = id;
0144 h_x[n] = x;
0145 h_y[n] = x;
0146 h_adc[n] = 1000;
0147 ++n;
0148 }
0149 ++ncl;
0150
0151 for (int x = 45; x > 40; --x) {
0152 h_id[n] = id;
0153 h_x[n] = x;
0154 h_y[n] = x;
0155 h_adc[n] = 1000;
0156 ++n;
0157 }
0158 ++ncl;
0159 h_id[n++] = invalidModuleId;
0160
0161 int xx[5] = {21, 25, 23, 24, 22};
0162 for (int k = 0; k < 5; ++k) {
0163 h_id[n] = id;
0164 h_x[n] = xx[k];
0165 h_y[n] = 20 + xx[k];
0166 h_adc[n] = 1000;
0167 ++n;
0168 }
0169
0170 ++ncl;
0171 for (int k = 0; k < 5; ++k) {
0172 h_id[n] = id;
0173 h_x[n] = xx[k];
0174 h_y[n] = 100;
0175 h_adc[n] = kn == 2 ? 100 : 1000;
0176 ++n;
0177 if (xx[k] % 2 == 0) {
0178 h_id[n] = id;
0179 h_x[n] = xx[k];
0180 h_y[n] = 101;
0181 h_adc[n] = 1000;
0182 ++n;
0183 }
0184 }
0185 }
0186 {
0187
0188 int id = 0;
0189 int x = 10;
0190 ++ncl;
0191 h_id[n] = id;
0192 h_x[n] = x;
0193 h_y[n] = x;
0194 h_adc[n] = 5000;
0195 ++n;
0196 }
0197
0198 for (int id = 11; id <= 1800; id += 2) {
0199 if ((id / 20) % 2)
0200 h_id[n++] = invalidModuleId;
0201 for (int x = 0; x < 40; x += 4) {
0202 ++ncl;
0203 if ((id / 10) % 2) {
0204 for (int k = 0; k < 10; ++k) {
0205 h_id[n] = id;
0206 h_x[n] = x;
0207 h_y[n] = x + y[k];
0208 h_adc[n] = 100;
0209 ++n;
0210 h_id[n] = id;
0211 h_x[n] = x + 1;
0212 h_y[n] = x + y[k] + 2;
0213 h_adc[n] = 1000;
0214 ++n;
0215 }
0216 } else {
0217 for (int k = 0; k < 10; ++k) {
0218 h_id[n] = id;
0219 h_x[n] = x;
0220 h_y[n] = x + y[9 - k];
0221 h_adc[n] = kn == 2 ? 10 : 1000;
0222 ++n;
0223 if (y[k] == 3)
0224 continue;
0225 if (id == 51) {
0226 h_id[n++] = invalidModuleId;
0227 h_id[n++] = invalidModuleId;
0228 }
0229 h_id[n] = id;
0230 h_x[n] = x + 1;
0231 h_y[n] = x + y[k] + 2;
0232 h_adc[n] = kn == 2 ? 10 : 1000;
0233 ++n;
0234 }
0235 }
0236 }
0237 }
0238 };
0239 for (auto kkk = 0; kkk < 5; ++kkk) {
0240 n = 0;
0241 ncl = 0;
0242 generateClusters(kkk);
0243
0244 std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl;
0245 assert(n <= numElements);
0246
0247 uint32_t nModules = 0;
0248 #ifdef __CUDACC__
0249 size_t size32 = n * sizeof(unsigned int);
0250 size_t size16 = n * sizeof(unsigned short);
0251
0252
0253 cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice));
0254 cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice));
0255 cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice));
0256 cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice));
0257 cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice));
0258
0259
0260 int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256);
0261 int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
0262 std::cout << "CUDA countModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
0263 << " threads\n";
0264
0265 cms::cuda::launch(
0266 countModules<Phase1>, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n);
0267
0268 blocksPerGrid = maxNumModules;
0269
0270 std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
0271 << " threads\n";
0272 cudaCheck(cudaMemset(d_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t)));
0273
0274 cms::cuda::launch(findClus<Phase1>,
0275 {blocksPerGrid, threadsPerBlock},
0276 d_raw.get(),
0277 d_id.get(),
0278 d_x.get(),
0279 d_y.get(),
0280 d_moduleStart.get(),
0281 d_clusInModule.get(),
0282 d_moduleId.get(),
0283 d_clus.get(),
0284 n);
0285 cudaDeviceSynchronize();
0286 cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
0287
0288 uint32_t nclus[maxNumModules], moduleId[nModules];
0289 cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), maxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0290
0291 std::cout << "before charge cut found " << std::accumulate(nclus, nclus + maxNumModules, 0) << " clusters"
0292 << std::endl;
0293 for (auto i = maxNumModules; i > 0; i--)
0294 if (nclus[i - 1] > 0) {
0295 std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0296 break;
0297 }
0298 if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0))
0299 std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
0300
0301 cms::cuda::launch(clusterChargeCut<Phase1>,
0302 {blocksPerGrid, threadsPerBlock},
0303 clusterThresholds,
0304 d_id.get(),
0305 d_adc.get(),
0306 d_moduleStart.get(),
0307 d_clusInModule.get(),
0308 d_moduleId.get(),
0309 d_clus.get(),
0310 n);
0311
0312 cudaDeviceSynchronize();
0313 #else
0314 h_moduleStart[0] = nModules;
0315 countModules<Phase1>(h_id.get(), h_moduleStart.get(), h_clus.get(), n);
0316 memset(h_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t));
0317
0318 findClus<Phase1>(h_raw.get(),
0319 h_id.get(),
0320 h_x.get(),
0321 h_y.get(),
0322 h_moduleStart.get(),
0323 h_clusInModule.get(),
0324 h_moduleId.get(),
0325 h_clus.get(),
0326 n);
0327
0328 nModules = h_moduleStart[0];
0329 auto nclus = h_clusInModule.get();
0330
0331 std::cout << "before charge cut found " << std::accumulate(nclus, nclus + maxNumModules, 0) << " clusters"
0332 << std::endl;
0333 for (auto i = maxNumModules; i > 0; i--)
0334 if (nclus[i - 1] > 0) {
0335 std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0336 break;
0337 }
0338 if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0))
0339 std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
0340
0341 clusterChargeCut<Phase1>(clusterThresholds,
0342 h_id.get(),
0343 h_adc.get(),
0344 h_moduleStart.get(),
0345 h_clusInModule.get(),
0346 h_moduleId.get(),
0347 h_clus.get(),
0348 n);
0349 #endif
0350
0351 std::cout << "found " << nModules << " Modules active" << std::endl;
0352
0353 #ifdef __CUDACC__
0354 cudaCheck(cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost));
0355 cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost));
0356 cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), maxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0357 cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0358 #endif
0359
0360 std::set<unsigned int> clids;
0361 for (int i = 0; i < n; ++i) {
0362 assert(h_id[i] != 666);
0363 if (h_id[i] == invalidModuleId)
0364 continue;
0365 assert(h_clus[i] >= 0);
0366 assert(h_clus[i] < int(nclus[h_id[i]]));
0367 clids.insert(h_id[i] * 1000 + h_clus[i]);
0368
0369 }
0370
0371
0372 auto p = clids.begin();
0373 auto cmid = (*p) / 1000;
0374 assert(0 == (*p) % 1000);
0375 auto c = p;
0376 ++c;
0377 std::cout << "first clusters " << *p << ' ' << *c << ' ' << nclus[cmid] << ' ' << nclus[(*c) / 1000] << std::endl;
0378 std::cout << "last cluster " << *clids.rbegin() << ' ' << nclus[(*clids.rbegin()) / 1000] << std::endl;
0379 for (; c != clids.end(); ++c) {
0380 auto cc = *c;
0381 auto pp = *p;
0382 auto mid = cc / 1000;
0383 auto pnc = pp % 1000;
0384 auto nc = cc % 1000;
0385 if (mid != cmid) {
0386 assert(0 == cc % 1000);
0387 assert(nclus[cmid] - 1 == pp % 1000);
0388
0389 cmid = mid;
0390 p = c;
0391 continue;
0392 }
0393 p = c;
0394
0395 if (nc != pnc + 1)
0396 std::cout << "error " << mid << ": " << nc << ' ' << pnc << std::endl;
0397 }
0398
0399 std::cout << "found " << std::accumulate(nclus, nclus + maxNumModules, 0) << ' ' << clids.size() << " clusters"
0400 << std::endl;
0401 for (auto i = maxNumModules; i > 0; i--)
0402 if (nclus[i - 1] > 0) {
0403 std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0404 break;
0405 }
0406
0407 }
0408 return 0;
0409 }