Warning, /RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu is written in an unsupported language. File is not indexed.
0001 /* Sushil Dubey, Shashi Dugad, TIFR, July 2017
0002 *
0003 * File Name: SiPixelRawToClusterGPUKernel.cu
0004 * Description: It converts Raw data into Digi Format on GPU
0005 * Finally the Output of RawToDigi data is given to pixelClusterizer
0006 **/
0007
0008 // C++ includes
0009 #include <cassert>
0010 #include <cstdio>
0011 #include <cstdlib>
0012 #include <cstring>
0013 #include <fstream>
0014 #include <iomanip>
0015 #include <iostream>
0016
0017 // CUDA includes
0018 #include <cuda_runtime.h>
0019
0020 // CMSSW includes
0021 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
0022 #include "CondFormats/SiPixelObjects/interface/SiPixelROCsStatusAndMapping.h"
0023 #include "DataFormats/FEDRawData/interface/FEDNumbering.h"
0024 #include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
0025 #include "DataFormats/SiPixelDigi/interface/SiPixelDigiConstants.h"
0026 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0027 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0028 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
0029 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h"
0030 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h"
0031 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
0032 // local includes
0033 #include "SiPixelRawToClusterGPUKernel.h"
0034
0035 namespace pixelgpudetails {
0036
// True when the raw detector id belongs to the pixel barrel subdetector.
__device__ bool isBarrel(uint32_t rawId) {
  auto subdet = (rawId >> DetId::kSubdetOffset) & DetId::kSubdetMask;
  return subdet == PixelSubdetector::PixelBarrel;
}
0040
// Look up the (rawId, rocInDet, moduleId) triple for a given FED/link/ROC
// in the cabling map. Links and ROCs are 1-based in the flattened index.
__device__ pixelgpudetails::DetIdGPU getRawId(const SiPixelROCsStatusAndMapping *cablingMap,
                                              uint8_t fed,
                                              uint32_t link,
                                              uint32_t roc) {
  const uint32_t idx = fed * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc;
  return pixelgpudetails::DetIdGPU{cablingMap->rawId[idx], cablingMap->rocInDet[idx], cablingMap->moduleId[idx]};
}
0050
//reference http://cmsdoxygen.web.cern.ch/cmsdoxygen/CMSSW_9_2_0/doc/html/dd/d31/FrameConversion_8cc_source.html
//http://cmslxr.fnal.gov/source/CondFormats/SiPixelObjects/src/PixelROC.cc?v=CMSSW_9_2_0#0071
// Convert a local (ROC-frame) pixel coordinate to the global module frame.
// A ROC inside a module can be read out in one of two orientations; each
// orientation is a (slope, offset) affine map applied per coordinate.
__device__ pixelgpudetails::Pixel frameConversion(
    bool bpix, int side, uint32_t layer, uint32_t rocIdInDetUnit, pixelgpudetails::Pixel local) {
  int slopeRow = 0, slopeCol = 0;
  int rowOffset = 0, colOffset = 0;

  // Orientation A: row grows with local row, column counts down from the
  // boundary of ROC `firstRoc`.
  auto orientA = [&](uint32_t firstRoc) {
    slopeRow = 1;
    slopeCol = -1;
    rowOffset = 0;
    colOffset = (firstRoc - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
  };
  // Orientation B: row counts down from the far module edge, column grows
  // from the boundary of ROC `baseRoc`.
  auto orientB = [&](uint32_t baseRoc) {
    slopeRow = -1;
    slopeCol = 1;
    rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
    colOffset = (rocIdInDetUnit - baseRoc) * pixelgpudetails::numColsInRoc;
  };

  if (bpix) {
    if (side == -1 && layer != 1) {  // -Z side: 4 non-flipped modules oriented like 'dddd', except Layer 1
      if (rocIdInDetUnit < 8)
        orientA(8);
      else
        orientB(8);
    } else {  // +Z side: 4 non-flipped modules oriented like 'pppp', but all 8 in layer1
      if (rocIdInDetUnit < 8)
        orientB(0);
      else
        orientA(16);
    }
  } else {
    // fpix: panel 1 and panel 2 use the same mapping, so no branch on `side`
    if (rocIdInDetUnit < 8)
      orientA(8);
    else
      orientB(8);
  }

  uint32_t gRow = rowOffset + slopeRow * local.row;
  uint32_t gCol = colOffset + slopeCol * local.col;
  // inside frameConversion row: gRow, column: gCol
  return pixelgpudetails::Pixel{gRow, gCol};
}
0121
// error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc
// Map a cabling-check status code (1-4) to the SiPixel error type (35-38).
// Returns 0 for any unrecognized status. The `debug` template parameter
// gates the per-error printf so release kernels pay no cost for it.
template <bool debug = false>
__device__ uint8_t conversionError(uint8_t fedId, uint8_t status) {
  uint8_t errorType = 0;

  switch (status) {
    case (1): {
      if constexpr (debug)
        // fixed: the '\n' was previously inside the parentheses, garbling the message
        printf("Error in Fed: %i, invalid channel Id (errorType = 35)\n", fedId);
      errorType = 35;
      break;
    }
    case (2): {
      if constexpr (debug)
        printf("Error in Fed: %i, invalid ROC Id (errorType = 36)\n", fedId);
      errorType = 36;
      break;
    }
    case (3): {
      if constexpr (debug)
        printf("Error in Fed: %i, invalid dcol/pixel value (errorType = 37)\n", fedId);
      errorType = 37;
      break;
    }
    case (4): {
      if constexpr (debug)
        printf("Error in Fed: %i, dcol/pixel read out of order (errorType = 38)\n", fedId);
      errorType = 38;
      break;
    }
    default:
      if constexpr (debug)
        printf("Cabling check returned unexpected result, status = %i\n", status);
  };

  return errorType;
}
0159
__device__ bool rocRowColIsValid(uint32_t rocRow, uint32_t rocCol) {
  /// row and column in ROC representation; both must fall inside one ROC
  const bool rowOk = rocRow < pixelgpudetails::numRowsInRoc;
  const bool colOk = rocCol < pixelgpudetails::numColsInRoc;
  return rowOk && colOk;
}
0164
// A double-column index is valid in [0, 26) and a pixel id within it in [2, 162).
__device__ bool dcolIsValid(uint32_t dcol, uint32_t pxid) {
  return dcol < 26 && pxid >= 2 && pxid < 162;
}
0166
// error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc
// Inspect the ROC field of a raw word: values >= 25 encode FED status/error
// conditions rather than a real ROC id. Returns the error code to record
// (possibly remapped, e.g. TBM overflow 30 -> 40), or 0 when the word
// carries no reportable error.
template <bool debug = false>
__device__ uint8_t
checkROC(uint32_t errorWord, uint8_t fedId, uint32_t link, const SiPixelROCsStatusAndMapping *cablingMap) {
  uint8_t errorType = (errorWord >> sipixelconstants::ROC_shift) & sipixelconstants::ERROR_mask;
  // ROC ids below 25 are ordinary ROCs, not error markers
  if (errorType < 25)
    return 0;
  bool errorFound = false;

  switch (errorType) {
    case (25): {
      // "invalid ROC": keep the error only when the cabling map actually
      // lists ROC 1 on this fed/link entry; otherwise treat it as benign
      errorFound = true;
      uint32_t index = fedId * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + 1;
      if (index > 1 && index <= cablingMap->size) {
        if (!(link == cablingMap->link[index] && 1 == cablingMap->roc[index]))
          errorFound = false;
      }
      if constexpr (debug)
        if (errorFound)
          printf("Invalid ROC = 25 found (errorType = 25)\n");
      break;
    }
    case (26): {
      // gap word: reported but carries no pixel payload
      if constexpr (debug)
        printf("Gap word found (errorType = 26)\n");
      errorFound = true;
      break;
    }
    case (27): {
      // dummy word: reported but carries no pixel payload
      if constexpr (debug)
        printf("Dummy word found (errorType = 27)\n");
      errorFound = true;
      break;
    }
    case (28): {
      // FED error FIFO nearly full
      if constexpr (debug)
        printf("Error fifo nearly full (errorType = 28)\n");
      errorFound = true;
      break;
    }
    case (29): {
      // channel timeout; the OMIT_ERR bit marks the first timeout word,
      // which is only reported in debug output (the error is still recorded)
      if constexpr (debug)
        printf("Timeout on a channel (errorType = 29)\n");
      if ((errorWord >> sipixelconstants::OMIT_ERR_shift) & sipixelconstants::OMIT_ERR_mask) {
        if constexpr (debug)
          printf("...first errorType=29 error, this gets masked out\n");
      }
      errorFound = true;
      break;
    }
    case (30): {
      // TBM error trailer: the 4 state bits distinguish overflow (1) from
      // "number of ROCs" (8); overflow is remapped to errorType 40
      if constexpr (debug)
        printf("TBM error trailer (errorType = 30)\n");
      int stateMatch_bits = 4;
      int stateMatch_shift = 8;
      uint32_t stateMatch_mask = ~(~uint32_t(0) << stateMatch_bits);
      int stateMatch = (errorWord >> stateMatch_shift) & stateMatch_mask;
      if (stateMatch != 1 && stateMatch != 8) {
        if constexpr (debug)
          printf("FED error 30 with unexpected State Bits (errorType = 30)\n");
      }
      if (stateMatch == 1)
        errorType = 40;  // 1=Overflow -> 40, 8=number of ROCs -> 30
      errorFound = true;
      break;
    }
    case (31): {
      // event number mismatch between TBM and FED
      if constexpr (debug)
        printf("Event number error (errorType = 31)\n");
      errorFound = true;
      break;
    }
    default:
      errorFound = false;
  };

  return errorFound ? errorType : 0;
}
0245
// error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc
// Recover the detector rawId associated with an error word, so the error can
// be attributed to a module. Returns 0xffffffff when no valid module can be
// identified (unknown error type, unmapped link, or an out-of-range timeout
// channel number).
template <bool debug = false>
__device__ uint32_t
getErrRawID(uint8_t fedId, uint32_t errWord, uint32_t errorType, const SiPixelROCsStatusAndMapping *cablingMap) {
  uint32_t rID = 0xffffffff;

  switch (errorType) {
    // these error types carry a valid link field; look up the module via ROC 1
    case 25:
    case 30:
    case 31:
    case 36:
    case 40: {
      uint32_t roc = 1;
      uint32_t link = sipixelconstants::getLink(errWord);
      uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
      if (rID_temp != gpuClustering::invalidModuleId)
        rID = rID_temp;
      break;
    }
    case 29: {
      // timeout: the channel number is not stored directly; it is rebuilt
      // from 5 data bits (one-hot channel-within-block) plus a 3-bit block id
      int chanNmbr = 0;
      const int DB0_shift = 0;
      const int DB1_shift = DB0_shift + 1;
      const int DB2_shift = DB1_shift + 1;
      const int DB3_shift = DB2_shift + 1;
      const int DB4_shift = DB3_shift + 1;
      const uint32_t DataBit_mask = ~(~uint32_t(0) << 1);  // single-bit mask

      int CH1 = (errWord >> DB0_shift) & DataBit_mask;
      int CH2 = (errWord >> DB1_shift) & DataBit_mask;
      int CH3 = (errWord >> DB2_shift) & DataBit_mask;
      int CH4 = (errWord >> DB3_shift) & DataBit_mask;
      int CH5 = (errWord >> DB4_shift) & DataBit_mask;
      int BLOCK_bits = 3;
      int BLOCK_shift = 8;
      uint32_t BLOCK_mask = ~(~uint32_t(0) << BLOCK_bits);
      int BLOCK = (errWord >> BLOCK_shift) & BLOCK_mask;
      int localCH = 1 * CH1 + 2 * CH2 + 3 * CH3 + 4 * CH4 + 5 * CH5;
      // even blocks hold channels 1-4 of a group of 9, odd blocks the other 5
      if (BLOCK % 2 == 0)
        chanNmbr = (BLOCK / 2) * 9 + localCH;
      else
        chanNmbr = ((BLOCK - 1) / 2) * 9 + 4 + localCH;
      if ((chanNmbr < 1) || (chanNmbr > 36))
        break;  // signifies unexpected result

      // the reconstructed channel number doubles as the link number
      uint32_t roc = 1;
      uint32_t link = chanNmbr;
      uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
      if (rID_temp != gpuClustering::invalidModuleId)
        rID = rID_temp;
      break;
    }
    // conversion errors: both link and ROC fields of the word are usable
    case 37:
    case 38: {
      uint32_t roc = sipixelconstants::getROC(errWord);
      uint32_t link = sipixelconstants::getLink(errWord);
      uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
      if (rID_temp != gpuClustering::invalidModuleId)
        rID = rID_temp;
      break;
    }
    default:
      break;
  };

  return rID;
}
0313
// Kernel to perform Raw to Digi conversion
//
// Decodes one 32-bit raw word per loop iteration (grid-stride) into a pixel
// digi written through digisView: (xx, yy, adc, pdigi, rawIdArr, moduleId).
// Words that decode to errors or belong to skipped ROCs leave the digi in
// the "invalid" state (moduleId == invalidModuleId) so the clusterizer
// ignores them. When includeErrors is set, decoded errors are appended to
// the err vector (which must be non-null in that case).
template <bool debug = false>
__global__ void RawToDigi_kernel(const SiPixelROCsStatusAndMapping *cablingMap,
                                 const unsigned char *modToUnp,
                                 const uint32_t wordCounter,
                                 const uint32_t *word,
                                 const uint8_t *fedIds,
                                 SiPixelDigisCUDASOAView digisView,
                                 cms::cuda::SimpleVector<SiPixelErrorCompact> *err,
                                 bool useQualityInfo,
                                 bool includeErrors) {
  //if (threadIdx.x==0) printf("Event: %u blockIdx.x: %u start: %u end: %u\n", eventno, blockIdx.x, begin, end);

  // grid-stride loop over all raw words
  int32_t first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int32_t iloop = first, nend = wordCounter; iloop < nend; iloop += blockDim.x * gridDim.x) {
    auto gIndex = iloop;
    auto dvgi = digisView[gIndex];
    dvgi.xx() = 0;
    dvgi.yy() = 0;
    dvgi.adc() = 0;
    bool skipROC = false;

    // one fedId entry covers two consecutive raw words
    uint8_t fedId = fedIds[gIndex / 2];  // +1200;

    // initialize (too many continue statements below)
    dvgi.pdigi() = 0;
    dvgi.rawIdArr() = 0;
    dvgi.moduleId() = gpuClustering::invalidModuleId;

    uint32_t ww = word[gIndex];  // Array containing 32 bit raw data
    if (ww == 0) {
      // 0 is an indicator of a noise/dead channel, skip these pixels during clusterization
      continue;
    }

    uint32_t link = sipixelconstants::getLink(ww);  // Extract link
    uint32_t roc = sipixelconstants::getROC(ww);    // Extract ROC in link

    // ROC values >= maxROCIndex are error markers; record and skip them
    uint8_t errorType = checkROC<debug>(ww, fedId, link, cablingMap);
    skipROC = (roc < pixelgpudetails::maxROCIndex) ? false : (errorType != 0);
    if (includeErrors and skipROC) {
      uint32_t rID = getErrRawID<debug>(fedId, ww, errorType, cablingMap);
      err->push_back(SiPixelErrorCompact{rID, ww, errorType, fedId});
      continue;
    }

    // check for spurious channels
    if (roc > MAX_ROC or link > MAX_LINK) {
      if constexpr (debug) {
        printf("spurious roc %d found on link %d, detector %d (index %d)\n",
               roc,
               link,
               getRawId(cablingMap, fedId, link, 1).rawId,
               gIndex);
      }
      continue;
    }

    // cabling-map index for this (fed, link, roc)
    uint32_t index = fedId * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc;
    if (useQualityInfo) {
      // skip ROCs flagged bad by the quality information
      skipROC = cablingMap->badRocs[index];
      if (skipROC)
        continue;
    }
    // skip ROCs excluded from unpacking
    skipROC = modToUnp[index];
    if (skipROC)
      continue;

    pixelgpudetails::DetIdGPU detId = getRawId(cablingMap, fedId, link, roc);
    uint32_t rawId = detId.rawId;
    uint32_t layer = 0;
    int side = 0, panel = 0, module = 0;
    bool barrel = isBarrel(rawId);
    if (barrel) {
      layer = (rawId >> pixelgpudetails::layerStartBit) & pixelgpudetails::layerMask;
      module = (rawId >> pixelgpudetails::moduleStartBit) & pixelgpudetails::moduleMask;
      side = (module < 5) ? -1 : 1;
    } else {
      // endcap ids
      layer = 0;
      panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask;
      side = (panel == 1) ? -1 : 1;
    }

    // ***special case of layer to 1 be handled here
    // BPIX layer 1 words carry row/col directly; all other modules carry
    // (dcol, pxid) that must be converted to local row/col
    pixelgpudetails::Pixel localPix;
    if (layer == 1) {
      uint32_t col = sipixelconstants::getCol(ww);
      uint32_t row = sipixelconstants::getRow(ww);
      localPix.row = row;
      localPix.col = col;
      if (includeErrors) {
        if (not rocRowColIsValid(row, col)) {
          uint8_t error = conversionError<debug>(fedId, 3);  //use the device function and fill the arrays
          err->push_back(SiPixelErrorCompact{rawId, ww, error, fedId});
          if constexpr (debug)
            printf("BPIX1 Error status: %i\n", error);
          continue;
        }
      }
    } else {
      // ***conversion rules for dcol and pxid
      uint32_t dcol = sipixelconstants::getDCol(ww);
      uint32_t pxid = sipixelconstants::getPxId(ww);
      uint32_t row = pixelgpudetails::numRowsInRoc - pxid / 2;
      uint32_t col = dcol * 2 + pxid % 2;
      localPix.row = row;
      localPix.col = col;
      if (includeErrors and not dcolIsValid(dcol, pxid)) {
        uint8_t error = conversionError<debug>(fedId, 3);
        err->push_back(SiPixelErrorCompact{rawId, ww, error, fedId});
        if constexpr (debug)
          printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc);
        continue;
      }
    }

    // map to the global module frame and fill the digi
    pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, detId.rocInDet, localPix);
    dvgi.xx() = globalPix.row;  // origin shifting by 1 0-159
    dvgi.yy() = globalPix.col;  // origin shifting by 1 0-415
    dvgi.adc() = sipixelconstants::getADC(ww);
    dvgi.pdigi() = pixelgpudetails::pack(globalPix.row, globalPix.col, dvgi.adc());
    dvgi.moduleId() = detId.moduleId;
    dvgi.rawIdArr() = rawId;
  }  // end of loop (gIndex < end)

}  // end of Raw to Digi kernel
0441
// Build the per-module hit "start index" prefix sum: on exit,
// moduleStart[i + 1] is the index of the first hit of module i in the final
// hit collection, with each module's cluster count clamped to
// maxHitsInModule(). Also publishes
//   nModules_Clusters[0] = number of modules (copied from *nModules),
//   nModules_Clusters[1] = total number of clusters,
//   nModules_Clusters[2] = number of clusters in BPIX1 (start of BPIX2).
//
// Launch requirements: exactly ONE block (the prefix scans below operate on
// fixed 1024-entry stripes that are then stitched together), 1024 threads.
template <typename TrackerTraits>
__global__ void fillHitsModuleStart(uint32_t const *__restrict__ clusInModule,
                                    uint32_t *__restrict__ moduleStart,
                                    uint32_t const *__restrict__ nModules,
                                    uint32_t *__restrict__ nModules_Clusters) {
  constexpr int nMaxModules = TrackerTraits::numberOfModules;
  constexpr int startBPIX2 = TrackerTraits::layerStart[1];

  assert(startBPIX2 < nMaxModules);
  assert(nMaxModules < 4096);  // easy to extend at least till 32*1024
  assert(nMaxModules > 1024);

  assert(1 == gridDim.x);
  assert(0 == blockIdx.x);

  int first = threadIdx.x;

  // limit to MaxHitsInModule;
  for (int i = first, iend = nMaxModules; i < iend; i += blockDim.x) {
    moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), clusInModule[i]);
  }

  constexpr bool isPhase2 = std::is_base_of<pixelTopology::Phase2, TrackerTraits>::value;
  __shared__ uint32_t ws[32];
  // scan in 1024-entry stripes (blockPrefixScan handles at most one stripe),
  // then add each stripe's last value onto the following stripe
  cms::cuda::blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws);
  constexpr int lastModules = isPhase2 ? 1024 : nMaxModules - 1024;
  cms::cuda::blockPrefixScan(moduleStart + 1024 + 1, moduleStart + 1024 + 1, lastModules, ws);

  if constexpr (isPhase2) {
    cms::cuda::blockPrefixScan(moduleStart + 2048 + 1, moduleStart + 2048 + 1, 1024, ws);
    cms::cuda::blockPrefixScan(moduleStart + 3072 + 1, moduleStart + 3072 + 1, nMaxModules - 3072, ws);
  }

  // stitch stripe 2 onto stripe 1
  for (int i = first + 1025, iend = isPhase2 ? 2049 : nMaxModules + 1; i < iend; i += blockDim.x) {
    moduleStart[i] += moduleStart[1024];
  }
  __syncthreads();

  if constexpr (isPhase2) {
    // stitch stripes 3 and 4 for the larger Phase2 module count
    for (int i = first + 2049, iend = 3073; i < iend; i += blockDim.x) {
      moduleStart[i] += moduleStart[2048];
    }
    __syncthreads();
    for (int i = first + 3073, iend = nMaxModules + 1; i < iend; i += blockDim.x) {
      moduleStart[i] += moduleStart[3072];
    }
    __syncthreads();
  }

  if (threadIdx.x == 0) {
    // copy the number of modules
    nModules_Clusters[0] = *nModules;
    // last element holds the number of all clusters
    nModules_Clusters[1] = moduleStart[nMaxModules];
    // moduleStart[startBPIX2] is the number of clusters in BPIX1
    nModules_Clusters[2] = moduleStart[startBPIX2];
  }

#ifdef GPU_DEBUG
  // check monotonicity around the last stripe-join boundary
  // (fixed: was 3027, a typo for the 3072 boundary used by the scans above)
  uint16_t maxH = isPhase2 ? 3072 : 1024;
  assert(0 == moduleStart[0]);
  auto c0 = std::min(gpuClustering::maxHitsInModule(), clusInModule[0]);
  assert(c0 == moduleStart[1]);
  assert(moduleStart[maxH] >= moduleStart[maxH - 1]);
  assert(moduleStart[maxH + 1] >= moduleStart[maxH]);
  assert(moduleStart[nMaxModules] >= moduleStart[maxH + 1]);

  constexpr int startFP1 = TrackerTraits::numberOfModulesInBarrel;
  constexpr int startLastFwd = TrackerTraits::layerStart[TrackerTraits::numberOfLayers];
  for (int i = first, iend = nMaxModules + 1; i < iend; i += blockDim.x) {
    if (0 != i)
      // fixed: was moduleStart[i - i], which always compared against moduleStart[0]
      assert(moduleStart[i] >= moduleStart[i - 1]);
    // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID]
    // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856]
    if (i == startBPIX2 || i == startFP1 || i == startLastFwd || i == nMaxModules)
      printf("moduleStart %d %d\n", i, moduleStart[i]);
  }

#endif
}
0522
// Interface to outside
//
// Full Phase1 raw-to-cluster chain on the given CUDA stream:
//   1. copy raw words + fed ids to the device and run RawToDigi_kernel,
//   2. calibrate digis, count modules, find clusters, apply the charge cut,
//   3. compute the per-module hit start indices and copy the
//      module/cluster counts back to the host (asynchronously).
// All kernel launches and copies are asynchronous on `stream`; errors found
// during unpacking are copied back when includeErrors is set.
void SiPixelRawToClusterGPUKernel::makeClustersAsync(bool isRun2,
                                                     const SiPixelClusterThresholds clusterThresholds,
                                                     const SiPixelROCsStatusAndMapping *cablingMap,
                                                     const unsigned char *modToUnp,
                                                     const SiPixelGainForHLTonGPU *gains,
                                                     const WordFedAppender &wordFed,
                                                     SiPixelFormatterErrors &&errors,
                                                     const uint32_t wordCounter,
                                                     const uint32_t fedCounter,
                                                     bool useQualityInfo,
                                                     bool includeErrors,
                                                     bool debug,
                                                     cudaStream_t stream) {
  using pixelTopology::Phase1;
  // we're not opting for calling this function in case of early events
  assert(wordCounter != 0);
  nDigis = wordCounter;

#ifdef GPU_DEBUG
  std::cout << "decoding " << wordCounter << " digis." << std::endl;
#endif

  // since wordCounter != 0 we're not allocating 0 bytes,
  // digis_d = SiPixelDigisCUDA(wordCounter, stream);
  digis_d = SiPixelDigisCUDA(size_t(wordCounter), stream);
  if (includeErrors) {
    digiErrors_d = SiPixelDigiErrorsCUDA(wordCounter, std::move(errors), stream);
  }
  clusters_d = SiPixelClustersCUDA(Phase1::numberOfModules, stream);

  // Begin Raw2Digi block
  {
    const int threadsPerBlock = 512;
    const int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock;  // fill it all

    // fed ids are stored one per pair of words, hence the even word count
    assert(0 == wordCounter % 2);
    // wordCounter is the total no of words in each event to be transferred on device
    auto word_d = cms::cuda::make_device_unique<uint32_t[]>(wordCounter, stream);
    auto fedId_d = cms::cuda::make_device_unique<uint8_t[]>(wordCounter, stream);

    cudaCheck(
        cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter * sizeof(uint32_t), cudaMemcpyDefault, stream));
    cudaCheck(cudaMemcpyAsync(
        fedId_d.get(), wordFed.fedId(), wordCounter * sizeof(uint8_t) / 2, cudaMemcpyDefault, stream));

    // Launch rawToDigi kernel
    if (debug)
      RawToDigi_kernel<true><<<blocks, threadsPerBlock, 0, stream>>>(  //
          cablingMap,
          modToUnp,
          wordCounter,
          word_d.get(),
          fedId_d.get(),
          digis_d.view(),
          digiErrors_d.error(),  // returns nullptr if default-constructed
          useQualityInfo,
          includeErrors);
    else
      RawToDigi_kernel<false><<<blocks, threadsPerBlock, 0, stream>>>(  //
          cablingMap,
          modToUnp,
          wordCounter,
          word_d.get(),
          fedId_d.get(),
          digis_d.view(),
          digiErrors_d.error(),  // returns nullptr if default-constructed
          useQualityInfo,
          includeErrors);
    cudaCheck(cudaGetLastError());
#ifdef GPU_DEBUG
    cudaCheck(cudaStreamSynchronize(stream));
#endif

    if (includeErrors) {
      digiErrors_d.copyErrorToHostAsync(stream);
    }
  }
  // End of Raw2Digi and passing data for clustering

  {
    // clusterizer ...
    using namespace gpuClustering;
    int threadsPerBlock = 256;
    int blocks = (std::max(int(wordCounter), int(Phase1::numberOfModules)) + threadsPerBlock - 1) / threadsPerBlock;

    if (isRun2)
      gpuCalibPixel::calibDigis<true><<<blocks, threadsPerBlock, 0, stream>>>(digis_d->moduleId(),
                                                                              digis_d->xx(),
                                                                              digis_d->yy(),
                                                                              digis_d->adc(),
                                                                              gains,
                                                                              wordCounter,
                                                                              clusters_d->moduleStart(),
                                                                              clusters_d->clusInModule(),
                                                                              clusters_d->clusModuleStart());
    else
      gpuCalibPixel::calibDigis<false><<<blocks, threadsPerBlock, 0, stream>>>(digis_d->moduleId(),
                                                                               digis_d->xx(),
                                                                               digis_d->yy(),
                                                                               digis_d->adc(),
                                                                               gains,
                                                                               wordCounter,
                                                                               clusters_d->moduleStart(),
                                                                               clusters_d->clusInModule(),
                                                                               clusters_d->clusModuleStart());

    cudaCheck(cudaGetLastError());
#ifdef GPU_DEBUG
    cudaCheck(cudaStreamSynchronize(stream));
    std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock
              << " threads\n";
#endif

    countModules<Phase1><<<blocks, threadsPerBlock, 0, stream>>>(
        digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), wordCounter);
    cudaCheck(cudaGetLastError());

    threadsPerBlock = 256 + 128;  /// should be larger than 6000/16 aka (maxPixInModule/maxiter in the kernel)
    // one block per module (consistent: Phase1 alias, was phase1PixelTopology::numberOfModules)
    blocks = Phase1::numberOfModules;
#ifdef GPU_DEBUG
    std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
#endif

    findClus<Phase1><<<blocks, threadsPerBlock, 0, stream>>>(digis_d->rawIdArr(),
                                                             digis_d->moduleId(),
                                                             digis_d->xx(),
                                                             digis_d->yy(),
                                                             clusters_d->moduleStart(),
                                                             clusters_d->clusInModule(),
                                                             clusters_d->moduleId(),
                                                             digis_d->clus(),
                                                             wordCounter);

    cudaCheck(cudaGetLastError());
#ifdef GPU_DEBUG
    cudaCheck(cudaStreamSynchronize(stream));
#endif

    // apply charge cut
    clusterChargeCut<Phase1><<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
                                                                     digis_d->moduleId(),
                                                                     digis_d->adc(),
                                                                     clusters_d->moduleStart(),
                                                                     clusters_d->clusInModule(),
                                                                     clusters_d->moduleId(),
                                                                     digis_d->clus(),
                                                                     wordCounter);

    cudaCheck(cudaGetLastError());

    // count the module start indices already here (instead of
    // rechits) so that the number of clusters/hits can be made
    // available in the rechit producer without additional points of
    // synchronization/ExternalWork
    auto nModules_Clusters_d = cms::cuda::make_device_unique<uint32_t[]>(3, stream);
    // MUST be ONE block
    fillHitsModuleStart<Phase1><<<1, 1024, 0, stream>>>(clusters_d->clusInModule(),
                                                        clusters_d->clusModuleStart(),
                                                        clusters_d->moduleStart(),
                                                        nModules_Clusters_d.get());

    // copy to host
    nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(3, stream);
    cudaCheck(cudaMemcpyAsync(
        nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream));

#ifdef GPU_DEBUG
    cudaCheck(cudaStreamSynchronize(stream));
#endif

  }  // end clusterizer scope
}
0699
// Phase2 clustering chain: digis arrive already unpacked (no raw-to-digi
// step). Copies the digi arrays to the device, then runs calibration,
// module counting, cluster finding, the charge cut, and the module-start
// prefix sum, all asynchronously on `stream`.
void SiPixelRawToClusterGPUKernel::makePhase2ClustersAsync(const SiPixelClusterThresholds clusterThresholds,
                                                           const uint16_t *moduleIds,
                                                           const uint16_t *xDigis,
                                                           const uint16_t *yDigis,
                                                           const uint16_t *adcDigis,
                                                           const uint32_t *packedData,
                                                           const uint32_t *rawIds,
                                                           const uint32_t numDigis,
                                                           cudaStream_t stream) {
  using namespace gpuClustering;
  using pixelTopology::Phase2;
  nDigis = numDigis;
  digis_d = SiPixelDigisCUDA(numDigis, stream);

  cudaCheck(cudaMemcpyAsync(digis_d->moduleId(), moduleIds, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
  cudaCheck(cudaMemcpyAsync(digis_d->xx(), xDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
  cudaCheck(cudaMemcpyAsync(digis_d->yy(), yDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
  cudaCheck(cudaMemcpyAsync(digis_d->adc(), adcDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
  cudaCheck(cudaMemcpyAsync(digis_d->pdigi(), packedData, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream));
  cudaCheck(cudaMemcpyAsync(digis_d->rawIdArr(), rawIds, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream));

  clusters_d = SiPixelClustersCUDA(Phase2::numberOfModules, stream);

  // Allocate the host buffer for [nModules, nClusters, nClustersBPIX1] ONCE,
  // with all 3 elements. Previously it was allocated with 2 elements here and
  // re-allocated with 3 below, which freed the original buffer while the
  // async copy into it (after countModules) could still be in flight.
  nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(3, stream);

  int threadsPerBlock = 512;
  int blocks = (int(numDigis) + threadsPerBlock - 1) / threadsPerBlock;

  gpuCalibPixel::calibDigisPhase2<<<blocks, threadsPerBlock, 0, stream>>>(digis_d->moduleId(),
                                                                          digis_d->adc(),
                                                                          numDigis,
                                                                          clusters_d->moduleStart(),
                                                                          clusters_d->clusInModule(),
                                                                          clusters_d->clusModuleStart());

  cudaCheck(cudaGetLastError());

#ifdef GPU_DEBUG
  cudaCheck(cudaStreamSynchronize(stream));
  std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
#endif

  countModules<Phase2><<<blocks, threadsPerBlock, 0, stream>>>(
      digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), numDigis);
  cudaCheck(cudaGetLastError());

  // read the number of modules into a data member, used by getProduct())
  cudaCheck(cudaMemcpyAsync(
      &(nModules_Clusters_h[0]), clusters_d->moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream));

  threadsPerBlock = 256;
  blocks = Phase2::numberOfModules;

#ifdef GPU_DEBUG
  cudaCheck(cudaStreamSynchronize(stream));
  std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
#endif
  findClus<Phase2><<<blocks, threadsPerBlock, 0, stream>>>(digis_d->rawIdArr(),
                                                           digis_d->moduleId(),
                                                           digis_d->xx(),
                                                           digis_d->yy(),
                                                           clusters_d->moduleStart(),
                                                           clusters_d->clusInModule(),
                                                           clusters_d->moduleId(),
                                                           digis_d->clus(),
                                                           numDigis);

  cudaCheck(cudaGetLastError());
#ifdef GPU_DEBUG
  cudaCheck(cudaStreamSynchronize(stream));
  std::cout << "CUDA clusterChargeCut kernel launch with " << blocks << " blocks of " << threadsPerBlock
            << " threads\n";
#endif

  // apply charge cut
  clusterChargeCut<Phase2><<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
                                                                   digis_d->moduleId(),
                                                                   digis_d->adc(),
                                                                   clusters_d->moduleStart(),
                                                                   clusters_d->clusInModule(),
                                                                   clusters_d->moduleId(),
                                                                   digis_d->clus(),
                                                                   numDigis);
  cudaCheck(cudaGetLastError());

  auto nModules_Clusters_d = cms::cuda::make_device_unique<uint32_t[]>(3, stream);
  // MUST be ONE block

#ifdef GPU_DEBUG
  cudaCheck(cudaStreamSynchronize(stream));
  std::cout << "CUDA fillHitsModuleStart kernel launch \n";
#endif

  fillHitsModuleStart<Phase2><<<1, 1024, 0, stream>>>(clusters_d->clusInModule(),
                                                      clusters_d->clusModuleStart(),
                                                      clusters_d->moduleStart(),
                                                      nModules_Clusters_d.get());

  // copy all three counters back (overwrites element 0 with the same value
  // written by the earlier single-element copy); buffer was allocated above
  cudaCheck(cudaMemcpyAsync(
      nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream));

#ifdef GPU_DEBUG
  cudaCheck(cudaStreamSynchronize(stream));
#endif
}  //
0806 } // namespace pixelgpudetails