File indexing completed on 2023-07-17 02:54:08
0001 #include <algorithm>
0002 #include <cassert>
0003 #include <cmath>
0004 #include <cstdint>
0005 #include <iomanip>
0006 #include <iostream>
0007 #include <memory>
0008 #include <numeric>
0009 #include <set>
0010 #include <vector>
0011
0012 #ifdef __CUDACC__
0013 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0014 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0015 #include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0017 #endif
0018
0019 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
0020 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h"
0021 #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClusterThresholds.h"
0022 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
0023
0024 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
0025
0026 int main(void) {
0027 #ifdef __CUDACC__
0028 cms::cudatest::requireDevices();
0029 #endif
0030
0031 using namespace gpuClustering;
0032 using pixelTopology::Phase1;
0033
0034 constexpr int numElements = 256 * maxNumModules;
0035 const SiPixelClusterThresholds clusterThresholds(
0036 clusterThresholdLayerOne, clusterThresholdOtherLayers, 0.f, 0.f, 0.f, 0.f);
0037
0038
0039 auto h_raw = std::make_unique<uint32_t[]>(numElements);
0040 auto h_id = std::make_unique<uint16_t[]>(numElements);
0041 auto h_x = std::make_unique<uint16_t[]>(numElements);
0042 auto h_y = std::make_unique<uint16_t[]>(numElements);
0043 auto h_adc = std::make_unique<uint16_t[]>(numElements);
0044 auto h_clus = std::make_unique<int[]>(numElements);
0045
0046 #ifdef __CUDACC__
0047 auto d_raw = cms::cuda::make_device_unique<uint32_t[]>(numElements, nullptr);
0048 auto d_id = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0049 auto d_x = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0050 auto d_y = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0051 auto d_adc = cms::cuda::make_device_unique<uint16_t[]>(numElements, nullptr);
0052 auto d_clus = cms::cuda::make_device_unique<int[]>(numElements, nullptr);
0053 auto d_moduleStart = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules + 1, nullptr);
0054 auto d_clusInModule = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
0055 auto d_moduleId = cms::cuda::make_device_unique<uint32_t[]>(maxNumModules, nullptr);
0056 #else
0057 auto h_moduleStart = std::make_unique<uint32_t[]>(maxNumModules + 1);
0058 auto h_clusInModule = std::make_unique<uint32_t[]>(maxNumModules);
0059 auto h_moduleId = std::make_unique<uint32_t[]>(maxNumModules);
0060 #endif
0061
0062
0063 int n = 0;
0064 int ncl = 0;
0065 int y[10] = {5, 7, 9, 1, 3, 0, 4, 8, 2, 6};
0066
0067 auto generateClusters = [&](int kn) {
0068 auto addBigNoise = 1 == kn % 2;
0069 if (addBigNoise) {
0070 constexpr int MaxPixels = 1000;
0071 int id = 666;
0072 for (int x = 0; x < 140; x += 3) {
0073 for (int yy = 0; yy < 400; yy += 3) {
0074 h_id[n] = id;
0075 h_x[n] = x;
0076 h_y[n] = yy;
0077 h_adc[n] = 1000;
0078 ++n;
0079 ++ncl;
0080 if (MaxPixels <= ncl)
0081 break;
0082 }
0083 if (MaxPixels <= ncl)
0084 break;
0085 }
0086 }
0087
0088 {
0089
0090 int id = 42;
0091 int x = 10;
0092 ++ncl;
0093 h_id[n] = id;
0094 h_x[n] = x;
0095 h_y[n] = x;
0096 h_adc[n] = kn == 0 ? 100 : 5000;
0097 ++n;
0098
0099
0100 ++ncl;
0101 h_id[n] = id;
0102 h_x[n] = x;
0103 h_y[n] = 0;
0104 h_adc[n] = 5000;
0105 ++n;
0106
0107 ++ncl;
0108 h_id[n] = id;
0109 h_x[n] = x + 80;
0110 h_y[n] = 2;
0111 h_adc[n] = 5000;
0112 ++n;
0113 h_id[n] = id;
0114 h_x[n] = x + 80;
0115 h_y[n] = 1;
0116 h_adc[n] = 5000;
0117 ++n;
0118
0119
0120 ++ncl;
0121 h_id[n] = id;
0122 h_x[n] = x;
0123 h_y[n] = 415;
0124 h_adc[n] = 5000;
0125 ++n;
0126
0127 ++ncl;
0128 h_id[n] = id;
0129 h_x[n] = x + 80;
0130 h_y[n] = 415;
0131 h_adc[n] = 2500;
0132 ++n;
0133 h_id[n] = id;
0134 h_x[n] = x + 80;
0135 h_y[n] = 414;
0136 h_adc[n] = 2500;
0137 ++n;
0138
0139
0140 ++ncl;
0141 for (int x = 20; x < 25; ++x) {
0142 h_id[n] = id;
0143 h_x[n] = x;
0144 h_y[n] = x;
0145 h_adc[n] = 1000;
0146 ++n;
0147 }
0148 ++ncl;
0149
0150 for (int x = 45; x > 40; --x) {
0151 h_id[n] = id;
0152 h_x[n] = x;
0153 h_y[n] = x;
0154 h_adc[n] = 1000;
0155 ++n;
0156 }
0157 ++ncl;
0158 h_id[n++] = invalidModuleId;
0159
0160 int xx[5] = {21, 25, 23, 24, 22};
0161 for (int k = 0; k < 5; ++k) {
0162 h_id[n] = id;
0163 h_x[n] = xx[k];
0164 h_y[n] = 20 + xx[k];
0165 h_adc[n] = 1000;
0166 ++n;
0167 }
0168
0169 ++ncl;
0170 for (int k = 0; k < 5; ++k) {
0171 h_id[n] = id;
0172 h_x[n] = xx[k];
0173 h_y[n] = 100;
0174 h_adc[n] = kn == 2 ? 100 : 1000;
0175 ++n;
0176 if (xx[k] % 2 == 0) {
0177 h_id[n] = id;
0178 h_x[n] = xx[k];
0179 h_y[n] = 101;
0180 h_adc[n] = 1000;
0181 ++n;
0182 }
0183 }
0184 }
0185 {
0186
0187 int id = 0;
0188 int x = 10;
0189 ++ncl;
0190 h_id[n] = id;
0191 h_x[n] = x;
0192 h_y[n] = x;
0193 h_adc[n] = 5000;
0194 ++n;
0195 }
0196
0197 for (int id = 11; id <= 1800; id += 2) {
0198 if ((id / 20) % 2)
0199 h_id[n++] = invalidModuleId;
0200 for (int x = 0; x < 40; x += 4) {
0201 ++ncl;
0202 if ((id / 10) % 2) {
0203 for (int k = 0; k < 10; ++k) {
0204 h_id[n] = id;
0205 h_x[n] = x;
0206 h_y[n] = x + y[k];
0207 h_adc[n] = 100;
0208 ++n;
0209 h_id[n] = id;
0210 h_x[n] = x + 1;
0211 h_y[n] = x + y[k] + 2;
0212 h_adc[n] = 1000;
0213 ++n;
0214 }
0215 } else {
0216 for (int k = 0; k < 10; ++k) {
0217 h_id[n] = id;
0218 h_x[n] = x;
0219 h_y[n] = x + y[9 - k];
0220 h_adc[n] = kn == 2 ? 10 : 1000;
0221 ++n;
0222 if (y[k] == 3)
0223 continue;
0224 if (id == 51) {
0225 h_id[n++] = invalidModuleId;
0226 h_id[n++] = invalidModuleId;
0227 }
0228 h_id[n] = id;
0229 h_x[n] = x + 1;
0230 h_y[n] = x + y[k] + 2;
0231 h_adc[n] = kn == 2 ? 10 : 1000;
0232 ++n;
0233 }
0234 }
0235 }
0236 }
0237 };
0238 for (auto kkk = 0; kkk < 5; ++kkk) {
0239 n = 0;
0240 ncl = 0;
0241 generateClusters(kkk);
0242
0243 std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl;
0244 assert(n <= numElements);
0245
0246 uint32_t nModules = 0;
0247 #ifdef __CUDACC__
0248 size_t size32 = n * sizeof(unsigned int);
0249 size_t size16 = n * sizeof(unsigned short);
0250
0251
0252 cudaCheck(cudaMemcpy(d_moduleStart.get(), &nModules, sizeof(uint32_t), cudaMemcpyHostToDevice));
0253 cudaCheck(cudaMemcpy(d_id.get(), h_id.get(), size16, cudaMemcpyHostToDevice));
0254 cudaCheck(cudaMemcpy(d_x.get(), h_x.get(), size16, cudaMemcpyHostToDevice));
0255 cudaCheck(cudaMemcpy(d_y.get(), h_y.get(), size16, cudaMemcpyHostToDevice));
0256 cudaCheck(cudaMemcpy(d_adc.get(), h_adc.get(), size16, cudaMemcpyHostToDevice));
0257
0258
0259 int threadsPerBlock = (kkk == 5) ? 512 : ((kkk == 3) ? 128 : 256);
0260 int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
0261 std::cout << "CUDA countModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
0262 << " threads\n";
0263
0264 cms::cuda::launch(
0265 countModules<Phase1>, {blocksPerGrid, threadsPerBlock}, d_id.get(), d_moduleStart.get(), d_clus.get(), n);
0266
0267 blocksPerGrid = maxNumModules;
0268
0269 std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock
0270 << " threads\n";
0271 cudaCheck(cudaMemset(d_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t)));
0272
0273 cms::cuda::launch(findClus<Phase1>,
0274 {blocksPerGrid, threadsPerBlock},
0275 d_raw.get(),
0276 d_id.get(),
0277 d_x.get(),
0278 d_y.get(),
0279 d_moduleStart.get(),
0280 d_clusInModule.get(),
0281 d_moduleId.get(),
0282 d_clus.get(),
0283 n);
0284 cudaDeviceSynchronize();
0285 cudaCheck(cudaMemcpy(&nModules, d_moduleStart.get(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
0286
0287 uint32_t nclus[maxNumModules], moduleId[nModules];
0288 cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), maxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0289
0290 std::cout << "before charge cut found " << std::accumulate(nclus, nclus + maxNumModules, 0) << " clusters"
0291 << std::endl;
0292 for (auto i = maxNumModules; i > 0; i--)
0293 if (nclus[i - 1] > 0) {
0294 std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0295 break;
0296 }
0297 if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0))
0298 std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
0299
0300 cms::cuda::launch(clusterChargeCut<Phase1>,
0301 {blocksPerGrid, threadsPerBlock},
0302 clusterThresholds,
0303 d_id.get(),
0304 d_adc.get(),
0305 d_moduleStart.get(),
0306 d_clusInModule.get(),
0307 d_moduleId.get(),
0308 d_clus.get(),
0309 n);
0310
0311 cudaDeviceSynchronize();
0312 #else
0313 h_moduleStart[0] = nModules;
0314 countModules<Phase1>(h_id.get(), h_moduleStart.get(), h_clus.get(), n);
0315 memset(h_clusInModule.get(), 0, maxNumModules * sizeof(uint32_t));
0316
0317 findClus<Phase1>(h_raw.get(),
0318 h_id.get(),
0319 h_x.get(),
0320 h_y.get(),
0321 h_moduleStart.get(),
0322 h_clusInModule.get(),
0323 h_moduleId.get(),
0324 h_clus.get(),
0325 n);
0326
0327 nModules = h_moduleStart[0];
0328 auto nclus = h_clusInModule.get();
0329
0330 std::cout << "before charge cut found " << std::accumulate(nclus, nclus + maxNumModules, 0) << " clusters"
0331 << std::endl;
0332 for (auto i = maxNumModules; i > 0; i--)
0333 if (nclus[i - 1] > 0) {
0334 std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0335 break;
0336 }
0337 if (ncl != std::accumulate(nclus, nclus + maxNumModules, 0))
0338 std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
0339
0340 clusterChargeCut<Phase1>(clusterThresholds,
0341 h_id.get(),
0342 h_adc.get(),
0343 h_moduleStart.get(),
0344 h_clusInModule.get(),
0345 h_moduleId.get(),
0346 h_clus.get(),
0347 n);
0348 #endif
0349
0350 std::cout << "found " << nModules << " Modules active" << std::endl;
0351
0352 #ifdef __CUDACC__
0353 cudaCheck(cudaMemcpy(h_id.get(), d_id.get(), size16, cudaMemcpyDeviceToHost));
0354 cudaCheck(cudaMemcpy(h_clus.get(), d_clus.get(), size32, cudaMemcpyDeviceToHost));
0355 cudaCheck(cudaMemcpy(&nclus, d_clusInModule.get(), maxNumModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0356 cudaCheck(cudaMemcpy(&moduleId, d_moduleId.get(), nModules * sizeof(uint32_t), cudaMemcpyDeviceToHost));
0357 #endif
0358
0359 std::set<unsigned int> clids;
0360 for (int i = 0; i < n; ++i) {
0361 assert(h_id[i] != 666);
0362 if (h_id[i] == invalidModuleId)
0363 continue;
0364 assert(h_clus[i] >= 0);
0365 assert(h_clus[i] < int(nclus[h_id[i]]));
0366 clids.insert(h_id[i] * 1000 + h_clus[i]);
0367
0368 }
0369
0370
0371 auto p = clids.begin();
0372 auto cmid = (*p) / 1000;
0373 assert(0 == (*p) % 1000);
0374 auto c = p;
0375 ++c;
0376 std::cout << "first clusters " << *p << ' ' << *c << ' ' << nclus[cmid] << ' ' << nclus[(*c) / 1000] << std::endl;
0377 std::cout << "last cluster " << *clids.rbegin() << ' ' << nclus[(*clids.rbegin()) / 1000] << std::endl;
0378 for (; c != clids.end(); ++c) {
0379 auto cc = *c;
0380 auto pp = *p;
0381 auto mid = cc / 1000;
0382 auto pnc = pp % 1000;
0383 auto nc = cc % 1000;
0384 if (mid != cmid) {
0385 assert(0 == cc % 1000);
0386 assert(nclus[cmid] - 1 == pp % 1000);
0387
0388 cmid = mid;
0389 p = c;
0390 continue;
0391 }
0392 p = c;
0393
0394 if (nc != pnc + 1)
0395 std::cout << "error " << mid << ": " << nc << ' ' << pnc << std::endl;
0396 }
0397
0398 std::cout << "found " << std::accumulate(nclus, nclus + maxNumModules, 0) << ' ' << clids.size() << " clusters"
0399 << std::endl;
0400 for (auto i = maxNumModules; i > 0; i--)
0401 if (nclus[i - 1] > 0) {
0402 std::cout << "last module is " << i - 1 << ' ' << nclus[i - 1] << std::endl;
0403 break;
0404 }
0405
0406 }
0407 return 0;
0408 }