/* Sushil Dubey, Shashi Dugad, TIFR, July 2017
 *
 * File Name: SiPixelRawToClusterGPUKernel.cu
 * Description: Converts raw data into digi format on the GPU.
 * Finally, the output of RawToDigi is passed on to the pixel clusterizer.
 */

// C++ includes
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>

// CUDA includes
#include <cuda_runtime.h>

// CMSSW includes
#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "CondFormats/SiPixelObjects/interface/SiPixelROCsStatusAndMapping.h"
#include "DataFormats/FEDRawData/interface/FEDNumbering.h"
#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
#include "DataFormats/SiPixelDigi/interface/SiPixelDigiConstants.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

// local includes
#include "SiPixelRawToClusterGPUKernel.h"
#include "gpuCalibPixel.h"
#include "gpuClusterChargeCut.h"
#include "gpuClustering.h"

//#define GPU_DEBUG

namespace pixelgpudetails {

  __device__ bool isBarrel(uint32_t rawId) {
    return (PixelSubdetector::PixelBarrel == ((rawId >> DetId::kSubdetOffset) & DetId::kSubdetMask));
  }

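  // The cabling map is stored as flat per-FED arrays indexed by (link, ROC).
  // Link and ROC numbers start from 1, so the entry for a given (fed, link, roc)
  // sits at index fed * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc.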
  __device__ pixelgpudetails::DetIdGPU getRawId(const SiPixelROCsStatusAndMapping *cablingMap,
                                                uint8_t fed,
                                                uint32_t link,
                                                uint32_t roc) {
    uint32_t index = fed * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc;
    pixelgpudetails::DetIdGPU detId = {
        cablingMap->rawId[index], cablingMap->rocInDet[index], cablingMap->moduleId[index]};
    return detId;
  }

  // reference http://cmsdoxygen.web.cern.ch/cmsdoxygen/CMSSW_9_2_0/doc/html/dd/d31/FrameConversion_8cc_source.html
  // http://cmslxr.fnal.gov/source/CondFormats/SiPixelObjects/src/PixelROC.cc?v=CMSSW_9_2_0#0071
  // Convert a local (ROC) pixel coordinate to the global (module) pixel coordinate
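  // Each module is read out by up to 16 ROCs of numRowsInRoc x numColsInRoc
  // (80 x 52) pixels arranged in two rows of eight. The slope/offset pairs below
  // define the affine map from the ROC-local (row, col) to the module-global frame
  // (160 rows x 416 columns); the orientation depends on barrel vs. forward, the
  // +/-Z side or panel, the layer (BPIX layer 1 is special) and the ROC index.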
  __device__ pixelgpudetails::Pixel frameConversion(
      bool bpix, int side, uint32_t layer, uint32_t rocIdInDetUnit, pixelgpudetails::Pixel local) {
    int slopeRow = 0, slopeCol = 0;
    int rowOffset = 0, colOffset = 0;

    if (bpix) {
      if (side == -1 && layer != 1) {  // -Z side: 4 non-flipped modules oriented like 'dddd', except Layer 1
        if (rocIdInDetUnit < 8) {
          slopeRow = 1;
          slopeCol = -1;
          rowOffset = 0;
          colOffset = (8 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
        } else {
          slopeRow = -1;
          slopeCol = 1;
          rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
          colOffset = (rocIdInDetUnit - 8) * pixelgpudetails::numColsInRoc;
        }       // if roc
      } else {  // +Z side: 4 non-flipped modules oriented like 'pppp', but all 8 in layer1
        if (rocIdInDetUnit < 8) {
          slopeRow = -1;
          slopeCol = 1;
          rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
          colOffset = rocIdInDetUnit * pixelgpudetails::numColsInRoc;
        } else {
          slopeRow = 1;
          slopeCol = -1;
          rowOffset = 0;
          colOffset = (16 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
        }
      }

    } else {             // fpix
      if (side == -1) {  // panel 1
        if (rocIdInDetUnit < 8) {
          slopeRow = 1;
          slopeCol = -1;
          rowOffset = 0;
          colOffset = (8 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
        } else {
          slopeRow = -1;
          slopeCol = 1;
          rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
          colOffset = (rocIdInDetUnit - 8) * pixelgpudetails::numColsInRoc;
        }
      } else {  // panel 2
        if (rocIdInDetUnit < 8) {
          slopeRow = 1;
          slopeCol = -1;
          rowOffset = 0;
          colOffset = (8 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
        } else {
          slopeRow = -1;
          slopeCol = 1;
          rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
          colOffset = (rocIdInDetUnit - 8) * pixelgpudetails::numColsInRoc;
        }

      }  // side
    }

    uint32_t gRow = rowOffset + slopeRow * local.row;
    uint32_t gCol = colOffset + slopeCol * local.col;
    // inside frameConversion row: gRow, column: gCol
    pixelgpudetails::Pixel global = {gRow, gCol};
    return global;
  }

  // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc
  template <bool debug = false>
  __device__ uint8_t conversionError(uint8_t fedId, uint8_t status) {
    uint8_t errorType = 0;

    switch (status) {
      case (1): {
        if constexpr (debug)
          printf("Error in Fed: %i, invalid channel Id (errorType = 35)\n", fedId);
        errorType = 35;
        break;
      }
      case (2): {
        if constexpr (debug)
          printf("Error in Fed: %i, invalid ROC Id (errorType = 36)\n", fedId);
        errorType = 36;
        break;
      }
      case (3): {
        if constexpr (debug)
          printf("Error in Fed: %i, invalid dcol/pixel value (errorType = 37)\n", fedId);
        errorType = 37;
        break;
      }
      case (4): {
        if constexpr (debug)
          printf("Error in Fed: %i, dcol/pixel read out of order (errorType = 38)\n", fedId);
        errorType = 38;
        break;
      }
      default:
        if constexpr (debug)
          printf("Cabling check returned unexpected result, status = %i\n", status);
    };

    return errorType;
  }

  __device__ bool rocRowColIsValid(uint32_t rocRow, uint32_t rocCol) {
    /// row and column in ROC representation
    return ((rocRow < pixelgpudetails::numRowsInRoc) & (rocCol < pixelgpudetails::numColsInRoc));
  }

  __device__ bool dcolIsValid(uint32_t dcol, uint32_t pxid) { return ((dcol < 26) & (2 <= pxid) & (pxid < 162)); }

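  // The ROC field of a data word doubles as a status code: values below 25 are real
  // ROC numbers, while 25-31 flag FED status words (invalid ROC, gap, dummy, FIFO
  // nearly full, timeout, TBM trailer, event number error). checkROC() returns the
  // corresponding error type, or 0 if the word carries no reportable error.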
  // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc
  template <bool debug = false>
  __device__ uint8_t
  checkROC(uint32_t errorWord, uint8_t fedId, uint32_t link, const SiPixelROCsStatusAndMapping *cablingMap) {
    uint8_t errorType = (errorWord >> sipixelconstants::ROC_shift) & sipixelconstants::ERROR_mask;
    if (errorType < 25)
      return 0;
    bool errorFound = false;

    switch (errorType) {
      case (25): {
        errorFound = true;
        uint32_t index = fedId * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + 1;
        if (index > 1 && index <= cablingMap->size) {
          if (!(link == cablingMap->link[index] && 1 == cablingMap->roc[index]))
            errorFound = false;
        }
        if constexpr (debug)
          if (errorFound)
            printf("Invalid ROC = 25 found (errorType = 25)\n");
        break;
      }
      case (26): {
        if constexpr (debug)
          printf("Gap word found (errorType = 26)\n");
        break;
      }
      case (27): {
        if constexpr (debug)
          printf("Dummy word found (errorType = 27)\n");
        break;
      }
      case (28): {
        if constexpr (debug)
          printf("Error fifo nearly full (errorType = 28)\n");
        errorFound = true;
        break;
      }
      case (29): {
        if constexpr (debug)
          printf("Timeout on a channel (errorType = 29)\n");
        if (!((errorWord >> sipixelconstants::OMIT_ERR_shift) & sipixelconstants::OMIT_ERR_mask)) {
          if constexpr (debug)
            printf("...2nd errorType=29 error, skip\n");
          break;
        }
        errorFound = true;
        break;
      }
      case (30): {
        if constexpr (debug)
          printf("TBM error trailer (errorType = 30)\n");
        int stateMatch_bits = 4;
        int stateMatch_shift = 8;
        uint32_t stateMatch_mask = ~(~uint32_t(0) << stateMatch_bits);
        int stateMatch = (errorWord >> stateMatch_shift) & stateMatch_mask;
        if (stateMatch != 1 && stateMatch != 8) {
          if constexpr (debug)
            printf("FED error 30 with unexpected State Bits (errorType = 30)\n");
          break;
        }
        if (stateMatch == 1)
          errorType = 40;  // 1=Overflow -> 40, 8=number of ROCs -> 30
        errorFound = true;
        break;
      }
      case (31): {
        if constexpr (debug)
          printf("Event number error (errorType = 31)\n");
        errorFound = true;
        break;
      }
      default:
        errorFound = false;
    };

    return errorFound ? errorType : 0;
  }

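  // Map an error word back to the DetId it refers to, so that the error can be
  // attached to a valid module: for link-level errors (25, 29, 30, 31, 36, 40) the
  // first ROC of the link is used, for pixel-level errors (37, 38) the ROC encoded
  // in the word itself. 0xffffffff is returned when no valid DetId can be found.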
  // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc
  template <bool debug = false>
  __device__ uint32_t
  getErrRawID(uint8_t fedId, uint32_t errWord, uint32_t errorType, const SiPixelROCsStatusAndMapping *cablingMap) {
    uint32_t rID = 0xffffffff;

    switch (errorType) {
      case 25:
      case 29:
      case 30:
      case 31:
      case 36:
      case 40: {
        uint32_t roc = 1;
        uint32_t link = sipixelconstants::getLink(errWord);
        uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
        if (rID_temp != gpuClustering::invalidModuleId)
          rID = rID_temp;
        break;
      }
      case 37:
      case 38: {
        uint32_t roc = sipixelconstants::getROC(errWord);
        uint32_t link = sipixelconstants::getLink(errWord);
        uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
        if (rID_temp != gpuClustering::invalidModuleId)
          rID = rID_temp;
        break;
      }
      default:
        break;
    };

    return rID;
  }

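  // Each 32-bit raw word carries the link and ROC numbers plus the pixel address
  // (row/col for BPIX layer 1, dcol/pxid otherwise) and the ADC value, extracted
  // with the sipixelconstants:: helpers. One thread decodes one word; fedIds holds
  // one FED id per pair of words, hence the gIndex / 2 lookup below.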
  // Kernel to perform Raw to Digi conversion
  template <bool debug = false>
  __global__ void RawToDigi_kernel(const SiPixelROCsStatusAndMapping *cablingMap,
                                   const unsigned char *modToUnp,
                                   const uint32_t wordCounter,
                                   const uint32_t *word,
                                   const uint8_t *fedIds,
                                   SiPixelDigisSoA::View digisView,
                                   cms::cuda::SimpleVector<SiPixelErrorCompact> *err,
                                   bool useQualityInfo,
                                   bool includeErrors) {
    //if (threadIdx.x==0) printf("Event: %u blockIdx.x: %u start: %u end: %u\n", eventno, blockIdx.x, begin, end);

    int32_t first = threadIdx.x + blockIdx.x * blockDim.x;
    for (int32_t iloop = first, nend = wordCounter; iloop < nend; iloop += blockDim.x * gridDim.x) {
      auto gIndex = iloop;
      auto dvgi = digisView[gIndex];
      dvgi.xx() = 0;
      dvgi.yy() = 0;
      dvgi.adc() = 0;
      bool skipROC = false;

      uint8_t fedId = fedIds[gIndex / 2];  // +1200;

      // initialize (too many continue statements below)
      dvgi.pdigi() = 0;
      dvgi.rawIdArr() = 0;
      dvgi.moduleId() = gpuClustering::invalidModuleId;

      uint32_t ww = word[gIndex];  // Array containing 32 bit raw data
      if (ww == 0) {
        // 0 is an indicator of a noise/dead channel, skip these pixels during clusterization
        continue;
      }

      uint32_t link = sipixelconstants::getLink(ww);  // Extract link
      uint32_t roc = sipixelconstants::getROC(ww);    // Extract ROC in link

      uint8_t errorType = checkROC<debug>(ww, fedId, link, cablingMap);
      skipROC = (roc < pixelgpudetails::maxROCIndex) ? false : (errorType != 0);
      if (includeErrors and skipROC) {
        uint32_t rID = getErrRawID<debug>(fedId, ww, errorType, cablingMap);
        if (rID != 0xffffffff)  // store errors only for valid DetIds
          err->push_back(SiPixelErrorCompact{rID, ww, errorType, fedId});
        continue;
      }

      // check for spurious channels
      if (roc > MAX_ROC or link > MAX_LINK) {
        uint32_t rawId = getRawId(cablingMap, fedId, link, 1).rawId;
        if constexpr (debug) {
          printf("spurious roc %d found on link %d, detector %d (index %d)\n", roc, link, rawId, gIndex);
        }
        if (roc > MAX_ROC and roc < 25) {
          uint8_t error = conversionError<debug>(fedId, 2);
          err->push_back(SiPixelErrorCompact{rawId, ww, error, fedId});
        }
        continue;
      }

      uint32_t index = fedId * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc;
      if (useQualityInfo) {
        skipROC = cablingMap->badRocs[index];
        if (skipROC)
          continue;
      }
      skipROC = modToUnp[index];
      if (skipROC)
        continue;

      pixelgpudetails::DetIdGPU detId = getRawId(cablingMap, fedId, link, roc);
      uint32_t rawId = detId.rawId;
      uint32_t layer = 0;
      int side = 0, panel = 0, module = 0;
      bool barrel = isBarrel(rawId);
      if (barrel) {
        layer = (rawId >> pixelgpudetails::layerStartBit) & pixelgpudetails::layerMask;
        module = (rawId >> pixelgpudetails::moduleStartBit) & pixelgpudetails::moduleMask;
        side = (module < 5) ? -1 : 1;
      } else {
        // endcap ids
        layer = 0;
        panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask;
        side = (panel == 1) ? -1 : 1;
      }

      // *** the special case of layer 1 is handled here
      pixelgpudetails::Pixel localPix;
      if (layer == 1) {
        uint32_t col = sipixelconstants::getCol(ww);
        uint32_t row = sipixelconstants::getRow(ww);
        localPix.row = row;
        localPix.col = col;
        if (includeErrors) {
          if (not rocRowColIsValid(row, col)) {
            uint8_t error = conversionError<debug>(fedId, 3);  //use the device function and fill the arrays
            err->push_back(SiPixelErrorCompact{rawId, ww, error, fedId});
            if constexpr (debug)
              printf("BPIX1 Error status: %i\n", error);
            continue;
          }
        }
      } else {
        // *** conversion rules for dcol and pxid
        uint32_t dcol = sipixelconstants::getDCol(ww);
        uint32_t pxid = sipixelconstants::getPxId(ww);
        uint32_t row = pixelgpudetails::numRowsInRoc - pxid / 2;
        uint32_t col = dcol * 2 + pxid % 2;
        localPix.row = row;
        localPix.col = col;
        if (includeErrors and not dcolIsValid(dcol, pxid)) {
          uint8_t error = conversionError<debug>(fedId, 3);
          err->push_back(SiPixelErrorCompact{rawId, ww, error, fedId});
          if constexpr (debug)
            printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc);
          continue;
        }
      }

      pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, detId.rocInDet, localPix);
      dvgi.xx() = globalPix.row;  // origin shifting by 1 0-159
      dvgi.yy() = globalPix.col;  // origin shifting by 1 0-415
      dvgi.adc() = sipixelconstants::getADC(ww);
      dvgi.pdigi() = pixelgpudetails::pack(globalPix.row, globalPix.col, dvgi.adc());
      dvgi.moduleId() = detId.moduleId;
      dvgi.rawIdArr() = rawId;
    }  // end of loop (gIndex < end)

  }  // end of Raw to Digi kernel

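  // Turn the per-module cluster counts into cumulative offsets (moduleStart) using
  // block-wide prefix scans over chunks of 1024 modules, then stitch the chunks
  // together by adding the running totals. Also fills nModules_Clusters with the
  // number of modules, the total number of clusters and the number of BPIX1 clusters.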
  template <typename TrackerTraits>
  __global__ void fillHitsModuleStart(uint32_t const *__restrict__ clusInModule,
                                      uint32_t *__restrict__ moduleStart,
                                      uint32_t const *__restrict__ nModules,
                                      uint32_t *__restrict__ nModules_Clusters) {
    constexpr int nMaxModules = TrackerTraits::numberOfModules;
    constexpr int startBPIX2 = TrackerTraits::layerStart[1];

    constexpr uint32_t maxHitsInModule = TrackerTraits::maxHitsInModule;

    assert(startBPIX2 < nMaxModules);
    assert(nMaxModules < 4096);  // easy to extend at least till 32*1024
    assert(nMaxModules > 1024);

    assert(1 == gridDim.x);
    assert(0 == blockIdx.x);

    int first = threadIdx.x;

    // limit to MaxHitsInModule;
    for (int i = first, iend = nMaxModules; i < iend; i += blockDim.x) {
      moduleStart[i + 1] = std::min(maxHitsInModule, clusInModule[i]);
    }

    constexpr bool isPhase2 = std::is_base_of<pixelTopology::Phase2, TrackerTraits>::value;
    __shared__ uint32_t ws[32];
    cms::cuda::blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws);
    constexpr int lastModules = isPhase2 ? 1024 : nMaxModules - 1024;
    cms::cuda::blockPrefixScan(moduleStart + 1024 + 1, moduleStart + 1024 + 1, lastModules, ws);

    if constexpr (isPhase2) {
      cms::cuda::blockPrefixScan(moduleStart + 2048 + 1, moduleStart + 2048 + 1, 1024, ws);
      cms::cuda::blockPrefixScan(moduleStart + 3072 + 1, moduleStart + 3072 + 1, nMaxModules - 3072, ws);
    }

    for (int i = first + 1025, iend = isPhase2 ? 2049 : nMaxModules + 1; i < iend; i += blockDim.x) {
      moduleStart[i] += moduleStart[1024];
    }
    __syncthreads();

    if constexpr (isPhase2) {
      for (int i = first + 2049, iend = 3073; i < iend; i += blockDim.x) {
        moduleStart[i] += moduleStart[2048];
      }
      __syncthreads();
      for (int i = first + 3073, iend = nMaxModules + 1; i < iend; i += blockDim.x) {
        moduleStart[i] += moduleStart[3072];
      }
      __syncthreads();
    }

    if (threadIdx.x == 0) {
      // copy the number of modules
      nModules_Clusters[0] = *nModules;
      // last element holds the number of all clusters
      nModules_Clusters[1] = moduleStart[nMaxModules];
      // element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1)
      nModules_Clusters[2] = moduleStart[startBPIX2];
    }

#ifdef GPU_DEBUG
    uint16_t maxH = isPhase2 ? 3072 : 1024;
    assert(0 == moduleStart[0]);
    auto c0 = std::min(maxHitsInModule, clusInModule[0]);
    assert(c0 == moduleStart[1]);
    assert(moduleStart[maxH] >= moduleStart[maxH - 1]);
    assert(moduleStart[maxH + 1] >= moduleStart[maxH]);
    assert(moduleStart[nMaxModules] >= moduleStart[maxH + 1]);

    constexpr int startFP1 = TrackerTraits::numberOfModulesInBarrel;
    constexpr int startLastFwd = TrackerTraits::layerStart[TrackerTraits::numberOfLayers];
    for (int i = first, iend = nMaxModules + 1; i < iend; i += blockDim.x) {
      if (0 != i)
        assert(moduleStart[i] >= moduleStart[i - 1]);
      // [BPX1, BPX2, BPX3, BPX4,  FP1,  FP2,  FP3,  FN1,  FN2,  FN3, LAST_VALID]
      // [   0,   96,  320,  672, 1184, 1296, 1408, 1520, 1632, 1744,       1856]
      if (i == startBPIX2 || i == startFP1 || i == startLastFwd || i == nMaxModules)
        printf("moduleStart %d %d\n", i, moduleStart[i]);
    }

#endif
  }

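  // Phase-1 workflow: unpack the FED raw words into digis (RawToDigi_kernel),
  // calibrate the ADC values (calibDigis), mark the module boundaries (countModules),
  // cluster the digis within each module (findClus), apply the cluster charge cut and
  // finally build the per-module cluster offsets (fillHitsModuleStart). All kernels
  // are launched asynchronously on the given CUDA stream.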
  // Interface to outside
  template <typename TrackerTraits>
  void SiPixelRawToClusterGPUKernel<TrackerTraits>::makePhase1ClustersAsync(
      const SiPixelClusterThresholds clusterThresholds,
      const SiPixelROCsStatusAndMapping *cablingMap,
      const unsigned char *modToUnp,
      const SiPixelGainForHLTonGPU *gains,
      const WordFedAppender &wordFed,
      SiPixelFormatterErrors &&errors,
      const uint32_t wordCounter,
      const uint32_t fedCounter,
      bool useQualityInfo,
      bool includeErrors,
      bool debug,
      cudaStream_t stream) {
    // this function must not be called for empty events
    assert(wordCounter != 0);
    nDigis = wordCounter;

#ifdef GPU_DEBUG
    std::cout << "decoding " << wordCounter << " digis." << std::endl;
#endif

    // since wordCounter != 0 we're not allocating 0 bytes,
    // digis_d = SiPixelDigisCUDA(wordCounter, stream);
    digis_d = SiPixelDigisCUDA(size_t(wordCounter), stream);
    if (includeErrors) {
      digiErrors_d = SiPixelDigiErrorsCUDA(wordCounter, std::move(errors), stream);
    }
    clusters_d = SiPixelClustersCUDA(TrackerTraits::numberOfModules, stream);

    // Begin Raw2Digi block
    {
      const int threadsPerBlock = 512;
      const int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock;  // fill it all

      assert(0 == wordCounter % 2);
      // wordCounter is the total number of words in the event to be transferred to the device
      auto word_d = cms::cuda::make_device_unique<uint32_t[]>(wordCounter, stream);
      auto fedId_d = cms::cuda::make_device_unique<uint8_t[]>(wordCounter, stream);

      cudaCheck(
          cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter * sizeof(uint32_t), cudaMemcpyDefault, stream));
      cudaCheck(cudaMemcpyAsync(
          fedId_d.get(), wordFed.fedId(), wordCounter * sizeof(uint8_t) / 2, cudaMemcpyDefault, stream));

      // Launch rawToDigi kernel
      if (debug)
        RawToDigi_kernel<true><<<blocks, threadsPerBlock, 0, stream>>>(  //
            cablingMap,
            modToUnp,
            wordCounter,
            word_d.get(),
            fedId_d.get(),
            digis_d.view(),
            digiErrors_d.error(),  // returns nullptr if default-constructed
            useQualityInfo,
            includeErrors);
      else
        RawToDigi_kernel<false><<<blocks, threadsPerBlock, 0, stream>>>(  //
            cablingMap,
            modToUnp,
            wordCounter,
            word_d.get(),
            fedId_d.get(),
            digis_d.view(),
            digiErrors_d.error(),  // returns nullptr if default-constructed
            useQualityInfo,
            includeErrors);
      cudaCheck(cudaGetLastError());
#ifdef GPU_DEBUG
      cudaCheck(cudaStreamSynchronize(stream));
#endif

      if (includeErrors) {
        digiErrors_d.copyErrorToHostAsync(stream);
      }
    }
    // End of Raw2Digi and passing data for clustering

    {
      // clusterizer ...
      using namespace gpuClustering;
      int threadsPerBlock = 256;
      int blocks =
          (std::max(int(wordCounter), int(TrackerTraits::numberOfModules)) + threadsPerBlock - 1) / threadsPerBlock;

      gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
                                                                        digis_d.view().moduleId(),
                                                                        digis_d.view().xx(),
                                                                        digis_d.view().yy(),
                                                                        digis_d.view().adc(),
                                                                        gains,
                                                                        wordCounter,
                                                                        clusters_d->moduleStart(),
                                                                        clusters_d->clusInModule(),
                                                                        clusters_d->clusModuleStart());

      cudaCheck(cudaGetLastError());
#ifdef GPU_DEBUG
      cudaCheck(cudaStreamSynchronize(stream));
#endif

#ifdef GPU_DEBUG
      std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock
                << " threads\n";
#endif

      countModules<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(
          digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), wordCounter);
      cudaCheck(cudaGetLastError());

      // should be larger than maxPixInModule/16 aka (maxPixInModule/maxiter in the kernel)
      threadsPerBlock = ((TrackerTraits::maxPixInModule / 16 + 128 - 1) / 128) * 128;
      blocks = TrackerTraits::numberOfModules;
#ifdef GPU_DEBUG
      std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
#endif

      findClus<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(digis_d->rawIdArr(),
                                                                      digis_d->moduleId(),
                                                                      digis_d->xx(),
                                                                      digis_d->yy(),
                                                                      clusters_d->moduleStart(),
                                                                      clusters_d->clusInModule(),
                                                                      clusters_d->moduleId(),
                                                                      digis_d->clus(),
                                                                      wordCounter);

      cudaCheck(cudaGetLastError());
#ifdef GPU_DEBUG
      cudaCheck(cudaStreamSynchronize(stream));
#endif

      // apply charge cut
      clusterChargeCut<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
                                                                              digis_d->moduleId(),
                                                                              digis_d->adc(),
                                                                              clusters_d->moduleStart(),
                                                                              clusters_d->clusInModule(),
                                                                              clusters_d->moduleId(),
                                                                              digis_d->clus(),
                                                                              wordCounter);

      cudaCheck(cudaGetLastError());

      // count the module start indices already here (instead of
      // rechits) so that the number of clusters/hits can be made
      // available in the rechit producer without additional points of
      // synchronization/ExternalWork
      auto nModules_Clusters_d = cms::cuda::make_device_unique<uint32_t[]>(3, stream);
      // MUST be ONE block
      fillHitsModuleStart<TrackerTraits><<<1, 1024, 0, stream>>>(clusters_d->clusInModule(),
                                                                 clusters_d->clusModuleStart(),
                                                                 clusters_d->moduleStart(),
                                                                 nModules_Clusters_d.get());

      // copy to host
      nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(3, stream);
      cudaCheck(cudaMemcpyAsync(
          nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream));

#ifdef GPU_DEBUG
      cudaCheck(cudaStreamSynchronize(stream));
#endif

    }  // end clusterizer scope
  }

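  // For Phase-2 the digis arrive already unpacked, so this entry point only copies
  // the digi columns to the device and runs the calibration and clustering kernels.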
  template <typename TrackerTraits>
  void SiPixelRawToClusterGPUKernel<TrackerTraits>::makePhase2ClustersAsync(
      const SiPixelClusterThresholds clusterThresholds,
      const uint16_t *moduleIds,
      const uint16_t *xDigis,
      const uint16_t *yDigis,
      const uint16_t *adcDigis,
      const uint32_t *packedData,
      const uint32_t *rawIds,
      const uint32_t numDigis,
      cudaStream_t stream) {
    using namespace gpuClustering;
    nDigis = numDigis;
    digis_d = SiPixelDigisCUDA(numDigis, stream);

    cudaCheck(cudaMemcpyAsync(digis_d->moduleId(), moduleIds, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
    cudaCheck(cudaMemcpyAsync(digis_d->xx(), xDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
    cudaCheck(cudaMemcpyAsync(digis_d->yy(), yDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
    cudaCheck(cudaMemcpyAsync(digis_d->adc(), adcDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
    cudaCheck(cudaMemcpyAsync(digis_d->pdigi(), packedData, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream));
    cudaCheck(cudaMemcpyAsync(digis_d->rawIdArr(), rawIds, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream));

    clusters_d = SiPixelClustersCUDA(TrackerTraits::numberOfModules, stream);

    nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(2, stream);

    int threadsPerBlock = 512;
    int blocks = (int(numDigis) + threadsPerBlock - 1) / threadsPerBlock;

    gpuCalibPixel::calibDigisPhase2<<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
                                                                            digis_d->moduleId(),
                                                                            digis_d->adc(),
                                                                            numDigis,
                                                                            clusters_d->moduleStart(),
                                                                            clusters_d->clusInModule(),
                                                                            clusters_d->clusModuleStart());

    cudaCheck(cudaGetLastError());

#ifdef GPU_DEBUG
    cudaCheck(cudaStreamSynchronize(stream));
    std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
#endif

    countModules<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(
        digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), numDigis);
    cudaCheck(cudaGetLastError());

    // read the number of modules into a data member, used by getProduct()
    cudaCheck(cudaMemcpyAsync(
        &(nModules_Clusters_h[0]), clusters_d->moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream));

    threadsPerBlock = 256;
    blocks = TrackerTraits::numberOfModules;

#ifdef GPU_DEBUG
    cudaCheck(cudaStreamSynchronize(stream));
    std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
#endif
    findClus<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(digis_d->rawIdArr(),
                                                                    digis_d->moduleId(),
                                                                    digis_d->xx(),
                                                                    digis_d->yy(),
                                                                    clusters_d->moduleStart(),
                                                                    clusters_d->clusInModule(),
                                                                    clusters_d->moduleId(),
                                                                    digis_d->clus(),
                                                                    numDigis);

    cudaCheck(cudaGetLastError());
#ifdef GPU_DEBUG
    cudaCheck(cudaStreamSynchronize(stream));
    std::cout << "CUDA clusterChargeCut kernel launch with " << blocks << " blocks of " << threadsPerBlock
              << " threads\n";
#endif

    // apply charge cut
    clusterChargeCut<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
                                                                            digis_d->moduleId(),
                                                                            digis_d->adc(),
                                                                            clusters_d->moduleStart(),
                                                                            clusters_d->clusInModule(),
                                                                            clusters_d->moduleId(),
                                                                            digis_d->clus(),
                                                                            numDigis);
    cudaCheck(cudaGetLastError());

    auto nModules_Clusters_d = cms::cuda::make_device_unique<uint32_t[]>(3, stream);

#ifdef GPU_DEBUG
    cudaCheck(cudaStreamSynchronize(stream));
    std::cout << "CUDA fillHitsModuleStart kernel launch \n";
#endif

    // MUST be ONE block
    fillHitsModuleStart<TrackerTraits><<<1, 1024, 0, stream>>>(clusters_d->clusInModule(),
                                                               clusters_d->clusModuleStart(),
                                                               clusters_d->moduleStart(),
                                                               nModules_Clusters_d.get());

    nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(3, stream);
    cudaCheck(cudaMemcpyAsync(
        nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream));

#ifdef GPU_DEBUG
    cudaCheck(cudaStreamSynchronize(stream));
#endif
  }

  template class SiPixelRawToClusterGPUKernel<pixelTopology::Phase1>;
  template class SiPixelRawToClusterGPUKernel<pixelTopology::Phase2>;
  template class SiPixelRawToClusterGPUKernel<pixelTopology::HIonPhase1>;
}  // namespace pixelgpudetails