SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu

0001 /* Sushil Dubey, Shashi Dugad, TIFR, July 2017
0002  *
0003  * File Name: RawToClusterGPU.cu
0004  * Description: It converts Raw data into Digi Format on GPU
0005  * Finaly the Output of RawToDigi data is given to pixelClusterizer
0006 **/
0007
0008 // C++ includes
0009 #include <cassert>
0010 #include <cstdio>
0011 #include <cstdlib>
0012 #include <cstring>
0013 #include <fstream>
0014 #include <iomanip>
0015 #include <iostream>
0016
0017 // CUDA includes
0018 #include <cuda_runtime.h>
0019
0020 // CMSSW includes
0021 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
0022 #include "CondFormats/SiPixelObjects/interface/SiPixelROCsStatusAndMapping.h"
0023 #include "DataFormats/FEDRawData/interface/FEDNumbering.h"
0024 #include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
0025 #include "DataFormats/SiPixelDigi/interface/SiPixelDigiConstants.h"
0026 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0027 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0028 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
0029
0030 // local includes
0031 #include "SiPixelRawToClusterGPUKernel.h"
0032 #include "gpuCalibPixel.h"
0033 #include "gpuClusterChargeCut.h"
0034 #include "gpuClustering.h"
0035
0036 //#define GPU_DEBUG
0037
0038 namespace pixelgpudetails {
0039
0040   __device__ bool isBarrel(uint32_t rawId) {
0041     return (PixelSubdetector::PixelBarrel == ((rawId >> DetId::kSubdetOffset) & DetId::kSubdetMask));
0042   }
0043
0044   __device__ pixelgpudetails::DetIdGPU getRawId(const SiPixelROCsStatusAndMapping *cablingMap,
0045                                                 uint8_t fed,
0046                                                 uint32_t link,
0047                                                 uint32_t roc) {
0048     uint32_t index = fed * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc;
0049     pixelgpudetails::DetIdGPU detId = {
0050         cablingMap->rawId[index], cablingMap->rocInDet[index], cablingMap->moduleId[index]};
0051     return detId;
0052   }
0053
0054   //reference http://cmsdoxygen.web.cern.ch/cmsdoxygen/CMSSW_9_2_0/doc/html/dd/d31/FrameConversion_8cc_source.html
0055   //http://cmslxr.fnal.gov/source/CondFormats/SiPixelObjects/src/PixelROC.cc?v=CMSSW_9_2_0#0071
0056   // Convert local pixel to pixelgpudetails::global pixel
0057   __device__ pixelgpudetails::Pixel frameConversion(
0058       bool bpix, int side, uint32_t layer, uint32_t rocIdInDetUnit, pixelgpudetails::Pixel local) {
0059     int slopeRow = 0, slopeCol = 0;
0060     int rowOffset = 0, colOffset = 0;
0061
0062     if (bpix) {
0063       if (side == -1 && layer != 1) {  // -Z side: 4 non-flipped modules oriented like 'dddd', except Layer 1
0064         if (rocIdInDetUnit < 8) {
0065           slopeRow = 1;
0066           slopeCol = -1;
0067           rowOffset = 0;
0068           colOffset = (8 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
0069         } else {
0070           slopeRow = -1;
0071           slopeCol = 1;
0072           rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
0073           colOffset = (rocIdInDetUnit - 8) * pixelgpudetails::numColsInRoc;
0074         }       // if roc
0075       } else {  // +Z side: 4 non-flipped modules oriented like 'pppp', but all 8 in layer1
0076         if (rocIdInDetUnit < 8) {
0077           slopeRow = -1;
0078           slopeCol = 1;
0079           rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
0080           colOffset = rocIdInDetUnit * pixelgpudetails::numColsInRoc;
0081         } else {
0082           slopeRow = 1;
0083           slopeCol = -1;
0084           rowOffset = 0;
0085           colOffset = (16 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
0086         }
0087       }
0088
0089     } else {             // fpix
0090       if (side == -1) {  // pannel 1
0091         if (rocIdInDetUnit < 8) {
0092           slopeRow = 1;
0093           slopeCol = -1;
0094           rowOffset = 0;
0095           colOffset = (8 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
0096         } else {
0097           slopeRow = -1;
0098           slopeCol = 1;
0099           rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
0100           colOffset = (rocIdInDetUnit - 8) * pixelgpudetails::numColsInRoc;
0101         }
0102       } else {  // pannel 2
0103         if (rocIdInDetUnit < 8) {
0104           slopeRow = 1;
0105           slopeCol = -1;
0106           rowOffset = 0;
0107           colOffset = (8 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
0108         } else {
0109           slopeRow = -1;
0110           slopeCol = 1;
0111           rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
0112           colOffset = (rocIdInDetUnit - 8) * pixelgpudetails::numColsInRoc;
0113         }
0114
0115       }  // side
0116     }
0117
0118     uint32_t gRow = rowOffset + slopeRow * local.row;
0119     uint32_t gCol = colOffset + slopeCol * local.col;
0120     // inside frameConversion row: gRow, column: gCol
0121     pixelgpudetails::Pixel global = {gRow, gCol};
0122     return global;
0123   }
0124
0125   // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc
0126   template <bool debug = false>
0127   __device__ uint8_t conversionError(uint8_t fedId, uint8_t status) {
0128     uint8_t errorType = 0;
0129
0130     switch (status) {
0131       case (1): {
0132         if constexpr (debug)
0133           printf("Error in Fed: %i, invalid channel Id (errorType = 35\n)", fedId);
0134         errorType = 35;
0135         break;
0136       }
0137       case (2): {
0138         if constexpr (debug)
0139           printf("Error in Fed: %i, invalid ROC Id (errorType = 36)\n", fedId);
0140         errorType = 36;
0141         break;
0142       }
0143       case (3): {
0144         if constexpr (debug)
0145           printf("Error in Fed: %i, invalid dcol/pixel value (errorType = 37)\n", fedId);
0146         errorType = 37;
0147         break;
0148       }
0149       case (4): {
0150         if constexpr (debug)
0151           printf("Error in Fed: %i, dcol/pixel read out of order (errorType = 38)\n", fedId);
0152         errorType = 38;
0153         break;
0154       }
0155       default:
0156         if constexpr (debug)
0157           printf("Cabling check returned unexpected result, status = %i\n", status);
0158     };
0159
0160     return errorType;
0161   }
0162
0163   __device__ bool rocRowColIsValid(uint32_t rocRow, uint32_t rocCol) {
0164     /// row and column in ROC representation
0165     return ((rocRow < pixelgpudetails::numRowsInRoc) & (rocCol < pixelgpudetails::numColsInRoc));
0166   }
0167
0168   __device__ bool dcolIsValid(uint32_t dcol, uint32_t pxid) { return ((dcol < 26) & (2 <= pxid) & (pxid < 162)); }
0169
0170   // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc
0171   template <bool debug = false>
0172   __device__ uint8_t
0173   checkROC(uint32_t errorWord, uint8_t fedId, uint32_t link, const SiPixelROCsStatusAndMapping *cablingMap) {
0174     uint8_t errorType = (errorWord >> sipixelconstants::ROC_shift) & sipixelconstants::ERROR_mask;
0175     if (errorType < 25)
0176       return 0;
0177     bool errorFound = false;
0178
0179     switch (errorType) {
0180       case (25): {
0181         errorFound = true;
0182         uint32_t index = fedId * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + 1;
0183         if (index > 1 && index <= cablingMap->size) {
0184           if (!(link == cablingMap->link[index] && 1 == cablingMap->roc[index]))
0185             errorFound = false;
0186         }
0187         if constexpr (debug)
0188           if (errorFound)
0189             printf("Invalid ROC = 25 found (errorType = 25)\n");
0190         break;
0191       }
0192       case (26): {
0193         if constexpr (debug)
0194           printf("Gap word found (errorType = 26)\n");
0195         break;
0196       }
0197       case (27): {
0198         if constexpr (debug)
0199           printf("Dummy word found (errorType = 27)\n");
0200         break;
0201       }
0202       case (28): {
0203         if constexpr (debug)
0204           printf("Error fifo nearly full (errorType = 28)\n");
0205         errorFound = true;
0206         break;
0207       }
0208       case (29): {
0209         if constexpr (debug)
0210           printf("Timeout on a channel (errorType = 29)\n");
0211         if (!((errorWord >> sipixelconstants::OMIT_ERR_shift) & sipixelconstants::OMIT_ERR_mask)) {
0212           if constexpr (debug)
0213             printf("...2nd errorType=29 error, skip\n");
0214           break;
0215         }
0216         errorFound = true;
0217         break;
0218       }
0219       case (30): {
0220         if constexpr (debug)
0221           printf("TBM error trailer (errorType = 30)\n");
0222         int stateMatch_bits = 4;
0223         int stateMatch_shift = 8;
0224         uint32_t stateMatch_mask = ~(~uint32_t(0) << stateMatch_bits);
0225         int stateMatch = (errorWord >> stateMatch_shift) & stateMatch_mask;
0226         if (stateMatch != 1 && stateMatch != 8) {
0227           if constexpr (debug)
0228             printf("FED error 30 with unexpected State Bits (errorType = 30)\n");
0229           break;
0230         }
0231         if (stateMatch == 1)
0232           errorType = 40;  // 1=Overflow -> 40, 8=number of ROCs -> 30
0233         errorFound = true;
0234         break;
0235       }
0236       case (31): {
0237         if constexpr (debug)
0238           printf("Event number error (errorType = 31)\n");
0239         errorFound = true;
0240         break;
0241       }
0242       default:
0243         errorFound = false;
0244     };
0245
0246     return errorFound ? errorType : 0;
0247   }
0248
0249   // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc
0250   template <bool debug = false>
0251   __device__ uint32_t
0252   getErrRawID(uint8_t fedId, uint32_t errWord, uint32_t errorType, const SiPixelROCsStatusAndMapping *cablingMap) {
0253     uint32_t rID = 0xffffffff;
0254
0255     switch (errorType) {
0256       case 25:
0257       case 29:
0258       case 30:
0259       case 31:
0260       case 36:
0261       case 40: {
0262         uint32_t roc = 1;
0263         uint32_t link = sipixelconstants::getLink(errWord);
0264         uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
0265         if (rID_temp != gpuClustering::invalidModuleId)
0266           rID = rID_temp;
0267         break;
0268       }
0269       case 37:
0270       case 38: {
0271         uint32_t roc = sipixelconstants::getROC(errWord);
0272         uint32_t link = sipixelconstants::getLink(errWord);
0273         uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
0274         if (rID_temp != gpuClustering::invalidModuleId)
0275           rID = rID_temp;
0276         break;
0277       }
0278       default:
0279         break;
0280     };
0281
0282     return rID;
0283   }
0284
0285   // Kernel to perform Raw to Digi conversion
0286   template <bool debug = false>
0287   __global__ void RawToDigi_kernel(const SiPixelROCsStatusAndMapping *cablingMap,
0288                                    const unsigned char *modToUnp,
0289                                    const uint32_t wordCounter,
0290                                    const uint32_t *word,
0291                                    const uint8_t *fedIds,
0292                                    SiPixelDigisSoA::View digisView,
0293                                    cms::cuda::SimpleVector<SiPixelErrorCompact> *err,
0294                                    bool useQualityInfo,
0295                                    bool includeErrors) {
0296     //if (threadIdx.x==0) printf("Event: %u blockIdx.x: %u start: %u end: %u\n", eventno, blockIdx.x, begin, end);
0297
0298     int32_t first = threadIdx.x + blockIdx.x * blockDim.x;
0299     for (int32_t iloop = first, nend = wordCounter; iloop < nend; iloop += blockDim.x * gridDim.x) {
0300       auto gIndex = iloop;
0301       auto dvgi = digisView[gIndex];
0302       dvgi.xx() = 0;
0303       dvgi.yy() = 0;
0304       dvgi.adc() = 0;
0305       bool skipROC = false;
0306
0307       uint8_t fedId = fedIds[gIndex / 2];  // +1200;
0308
0309       // initialize (too many coninue below)
0310       dvgi.pdigi() = 0;
0311       dvgi.rawIdArr() = 0;
0312       dvgi.moduleId() = gpuClustering::invalidModuleId;
0313
0314       uint32_t ww = word[gIndex];  // Array containing 32 bit raw data
0315       if (ww == 0) {
0316         // 0 is an indicator of a noise/dead channel, skip these pixels during clusterization
0317         continue;
0318       }
0319
0320       uint32_t link = sipixelconstants::getLink(ww);  // Extract link
0321       uint32_t roc = sipixelconstants::getROC(ww);    // Extract ROC in link
0322
0323       uint8_t errorType = checkROC<debug>(ww, fedId, link, cablingMap);
0324       skipROC = (roc < pixelgpudetails::maxROCIndex) ? false : (errorType != 0);
0325       if (includeErrors and skipROC) {
0326         uint32_t rID = getErrRawID<debug>(fedId, ww, errorType, cablingMap);
0327         if (rID != 0xffffffff)  // store errors only for valid DetIds
0328           err->push_back(SiPixelErrorCompact{rID, ww, errorType, fedId});
0329         continue;
0330       }
0331
0332       // check for spurious channels
0333       if (roc > MAX_ROC or link > MAX_LINK) {
0334         uint32_t rawId = getRawId(cablingMap, fedId, link, 1).rawId;
0335         if constexpr (debug) {
0336           printf("spurious roc %d found on link %d, detector %d (index %d)\n", roc, link, rawId, gIndex);
0337         }
0338         if (roc > MAX_ROC and roc < 25) {
0339           uint8_t error = conversionError<debug>(fedId, 2);
0340           err->push_back(SiPixelErrorCompact{rawId, ww, error, fedId});
0341         }
0342         continue;
0343       }
0344
0345       uint32_t index = fedId * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc;
0346       if (useQualityInfo) {
0347         skipROC = cablingMap->badRocs[index];
0348         if (skipROC)
0349           continue;
0350       }
0351       skipROC = modToUnp[index];
0352       if (skipROC)
0353         continue;
0354
0355       pixelgpudetails::DetIdGPU detId = getRawId(cablingMap, fedId, link, roc);
0356       uint32_t rawId = detId.rawId;
0357       uint32_t layer = 0;
0358       int side = 0, panel = 0, module = 0;
0359       bool barrel = isBarrel(rawId);
0360       if (barrel) {
0361         layer = (rawId >> pixelgpudetails::layerStartBit) & pixelgpudetails::layerMask;
0362         module = (rawId >> pixelgpudetails::moduleStartBit) & pixelgpudetails::moduleMask;
0363         side = (module < 5) ? -1 : 1;
0364       } else {
0365         // endcap ids
0366         layer = 0;
0367         panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask;
0368         side = (panel == 1) ? -1 : 1;
0369       }
0370
0371       // ***special case of layer to 1 be handled here
0372       pixelgpudetails::Pixel localPix;
0373       if (layer == 1) {
0374         uint32_t col = sipixelconstants::getCol(ww);
0375         uint32_t row = sipixelconstants::getRow(ww);
0376         localPix.row = row;
0377         localPix.col = col;
0378         if (includeErrors) {
0379           if (not rocRowColIsValid(row, col)) {
0380             uint8_t error = conversionError<debug>(fedId, 3);  //use the device function and fill the arrays
0381             err->push_back(SiPixelErrorCompact{rawId, ww, error, fedId});
0382             if constexpr (debug)
0383               printf("BPIX1  Error status: %i\n", error);
0384             continue;
0385           }
0386         }
0387       } else {
0388         // ***conversion rules for dcol and pxid
0389         uint32_t dcol = sipixelconstants::getDCol(ww);
0390         uint32_t pxid = sipixelconstants::getPxId(ww);
0391         uint32_t row = pixelgpudetails::numRowsInRoc - pxid / 2;
0392         uint32_t col = dcol * 2 + pxid % 2;
0393         localPix.row = row;
0394         localPix.col = col;
0395         if (includeErrors and not dcolIsValid(dcol, pxid)) {
0396           uint8_t error = conversionError<debug>(fedId, 3);
0397           err->push_back(SiPixelErrorCompact{rawId, ww, error, fedId});
0398           if constexpr (debug)
0399             printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc);
0400           continue;
0401         }
0402       }
0403
0404       pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, detId.rocInDet, localPix);
0405       dvgi.xx() = globalPix.row;  // origin shifting by 1 0-159
0406       dvgi.yy() = globalPix.col;  // origin shifting by 1 0-415
0407       dvgi.adc() = sipixelconstants::getADC(ww);
0408       dvgi.pdigi() = pixelgpudetails::pack(globalPix.row, globalPix.col, dvgi.adc());
0409       dvgi.moduleId() = detId.moduleId;
0410       dvgi.rawIdArr() = rawId;
0411     }  // end of loop (gIndex < end)
0412
0413   }  // end of Raw to Digi kernel
0414
0415   template <typename TrackerTraits>
0416   __global__ void fillHitsModuleStart(uint32_t const *__restrict__ clusInModule,
0417                                       uint32_t *__restrict__ moduleStart,
0418                                       uint32_t const *__restrict__ nModules,
0419                                       uint32_t *__restrict__ nModules_Clusters) {
0420     constexpr int nMaxModules = TrackerTraits::numberOfModules;
0421     constexpr int startBPIX2 = TrackerTraits::layerStart[1];
0422
0423     constexpr uint32_t maxHitsInModule = TrackerTraits::maxHitsInModule;
0424
0425     assert(startBPIX2 < nMaxModules);
0426     assert(nMaxModules < 4096);  // easy to extend at least till 32*1024
0427     assert(nMaxModules > 1024);
0428
0429     assert(1 == gridDim.x);
0430     assert(0 == blockIdx.x);
0431
0432     int first = threadIdx.x;
0433
0434     // limit to MaxHitsInModule;
0435     for (int i = first, iend = nMaxModules; i < iend; i += blockDim.x) {
0436       moduleStart[i + 1] = std::min(maxHitsInModule, clusInModule[i]);
0437     }
0438
0439     constexpr bool isPhase2 = std::is_base_of<pixelTopology::Phase2, TrackerTraits>::value;
0440     __shared__ uint32_t ws[32];
0441     cms::cuda::blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws);
0442     constexpr int lastModules = isPhase2 ? 1024 : nMaxModules - 1024;
0443     cms::cuda::blockPrefixScan(moduleStart + 1024 + 1, moduleStart + 1024 + 1, lastModules, ws);
0444
0445     if constexpr (isPhase2) {
0446       cms::cuda::blockPrefixScan(moduleStart + 2048 + 1, moduleStart + 2048 + 1, 1024, ws);
0447       cms::cuda::blockPrefixScan(moduleStart + 3072 + 1, moduleStart + 3072 + 1, nMaxModules - 3072, ws);
0448     }
0449
0450     for (int i = first + 1025, iend = isPhase2 ? 2049 : nMaxModules + 1; i < iend; i += blockDim.x) {
0451       moduleStart[i] += moduleStart[1024];
0452     }
0453     __syncthreads();
0454
0455     if constexpr (isPhase2) {
0456       for (int i = first + 2049, iend = 3073; i < iend; i += blockDim.x) {
0457         moduleStart[i] += moduleStart[2048];
0458       }
0459       __syncthreads();
0460       for (int i = first + 3073, iend = nMaxModules + 1; i < iend; i += blockDim.x) {
0461         moduleStart[i] += moduleStart[3072];
0462       }
0463       __syncthreads();
0464     }
0465
0466     if (threadIdx.x == 0) {
0467       // copy the number of modules
0468       nModules_Clusters[0] = *nModules;
0469       // last element holds the number of all clusters
0470       nModules_Clusters[1] = moduleStart[nMaxModules];
0471       // element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1)
0472       nModules_Clusters[2] = moduleStart[startBPIX2];
0473     }
0474
0475 #ifdef GPU_DEBUG
0476     uint16_t maxH = isPhase2 ? 3027 : 1024;
0477     assert(0 == moduleStart[0]);
0478     auto c0 = std::min(maxHitsInModule, clusInModule[0]);
0479     assert(c0 == moduleStart[1]);
0480     assert(moduleStart[maxH] >= moduleStart[maxH - 1]);
0481     assert(moduleStart[maxH + 1] >= moduleStart[maxH]);
0482     assert(moduleStart[nMaxModules] >= moduleStart[maxH + 1]);
0483
0484     constexpr int startFP1 = TrackerTraits::numberOfModulesInBarrel;
0485     constexpr int startLastFwd = TrackerTraits::layerStart[TrackerTraits::numberOfLayers];
0486     for (int i = first, iend = nMaxModules + 1; i < iend; i += blockDim.x) {
0487       if (0 != i)
0488         assert(moduleStart[i] >= moduleStart[i - i]);
0489       // [BPX1, BPX2, BPX3, BPX4,  FP1,  FP2,  FP3,  FN1,  FN2,  FN3, LAST_VALID]
0490       // [   0,   96,  320,  672, 1184, 1296, 1408, 1520, 1632, 1744,       1856]
0491       if (i == startBPIX2 || i == startFP1 || i == startLastFwd || i == nMaxModules)
0492         printf("moduleStart %d %d\n", i, moduleStart[i]);
0493     }
0494
0495 #endif
0496   }
0497
0498   // Interface to outside
0499   template <typename TrackerTraits>
0500   void SiPixelRawToClusterGPUKernel<TrackerTraits>::makePhase1ClustersAsync(
0501       const SiPixelClusterThresholds clusterThresholds,
0502       const SiPixelROCsStatusAndMapping *cablingMap,
0503       const unsigned char *modToUnp,
0504       const SiPixelGainForHLTonGPU *gains,
0505       const WordFedAppender &wordFed,
0506       SiPixelFormatterErrors &&errors,
0507       const uint32_t wordCounter,
0508       const uint32_t fedCounter,
0509       bool useQualityInfo,
0510       bool includeErrors,
0511       bool debug,
0512       cudaStream_t stream) {
0513     // we're not opting for calling this function in case of early events
0514     assert(wordCounter != 0);
0515     nDigis = wordCounter;
0516
0517 #ifdef GPU_DEBUG
0518     std::cout << "decoding " << wordCounter << " digis." << std::endl;
0519 #endif
0520
0521     // since wordCounter != 0 we're not allocating 0 bytes,
0522     // digis_d = SiPixelDigisCUDA(wordCounter, stream);
0523     digis_d = SiPixelDigisCUDA(size_t(wordCounter), stream);
0524     if (includeErrors) {
0525       digiErrors_d = SiPixelDigiErrorsCUDA(wordCounter, std::move(errors), stream);
0526     }
0527     clusters_d = SiPixelClustersCUDA(TrackerTraits::numberOfModules, stream);
0528
0529     // Begin Raw2Digi block
0530     {
0531       const int threadsPerBlock = 512;
0532       const int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock;  // fill it all
0533
0534       assert(0 == wordCounter % 2);
0535       // wordCounter is the total no of words in each event to be trasfered on device
0536       auto word_d = cms::cuda::make_device_unique<uint32_t[]>(wordCounter, stream);
0537       auto fedId_d = cms::cuda::make_device_unique<uint8_t[]>(wordCounter, stream);
0538
0539       cudaCheck(
0540           cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter * sizeof(uint32_t), cudaMemcpyDefault, stream));
0541       cudaCheck(cudaMemcpyAsync(
0542           fedId_d.get(), wordFed.fedId(), wordCounter * sizeof(uint8_t) / 2, cudaMemcpyDefault, stream));
0543
0544       // Launch rawToDigi kernel
0545       if (debug)
0546         RawToDigi_kernel<true><<<blocks, threadsPerBlock, 0, stream>>>(  //
0547             cablingMap,
0548             modToUnp,
0549             wordCounter,
0550             word_d.get(),
0551             fedId_d.get(),
0552             digis_d.view(),
0553             digiErrors_d.error(),  // returns nullptr if default-constructed
0554             useQualityInfo,
0555             includeErrors);
0556       else
0557         RawToDigi_kernel<false><<<blocks, threadsPerBlock, 0, stream>>>(  //
0558             cablingMap,
0559             modToUnp,
0560             wordCounter,
0561             word_d.get(),
0562             fedId_d.get(),
0563             digis_d.view(),
0564             digiErrors_d.error(),  // returns nullptr if default-constructed
0565             useQualityInfo,
0566             includeErrors);
0567       cudaCheck(cudaGetLastError());
0568 #ifdef GPU_DEBUG
0569       cudaCheck(cudaStreamSynchronize(stream));
0570 #endif
0571
0572       if (includeErrors) {
0573         digiErrors_d.copyErrorToHostAsync(stream);
0574       }
0575     }
0576     // End of Raw2Digi and passing data for clustering
0577
0578     {
0579       // clusterizer ...
0580       using namespace gpuClustering;
0581       int threadsPerBlock = 256;
0582       int blocks =
0583           (std::max(int(wordCounter), int(TrackerTraits::numberOfModules)) + threadsPerBlock - 1) / threadsPerBlock;
0584
0585       gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
0586                                                                         digis_d.view().moduleId(),
0587                                                                         digis_d.view().xx(),
0588                                                                         digis_d.view().yy(),
0589                                                                         digis_d.view().adc(),
0590                                                                         gains,
0591                                                                         wordCounter,
0592                                                                         clusters_d->moduleStart(),
0593                                                                         clusters_d->clusInModule(),
0594                                                                         clusters_d->clusModuleStart());
0595
0596       cudaCheck(cudaGetLastError());
0597 #ifdef GPU_DEBUG
0598       cudaCheck(cudaStreamSynchronize(stream));
0599 #endif
0600
0601 #ifdef GPU_DEBUG
0602       std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock
0603                 << " threads\n";
0604 #endif
0605
0606       countModules<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(
0607           digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), wordCounter);
0608       cudaCheck(cudaGetLastError());
0609
0610       // should be larger than maxPixInModule/16 aka (maxPixInModule/maxiter in the kernel)
0611       threadsPerBlock = ((TrackerTraits::maxPixInModule / 16 + 128 - 1) / 128) * 128;
0612       blocks = TrackerTraits::numberOfModules;
0613 #ifdef GPU_DEBUG
0614       std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
0615 #endif
0616
0617       findClus<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(digis_d->rawIdArr(),
0618                                                                       digis_d->moduleId(),
0619                                                                       digis_d->xx(),
0620                                                                       digis_d->yy(),
0621                                                                       clusters_d->moduleStart(),
0622                                                                       clusters_d->clusInModule(),
0623                                                                       clusters_d->moduleId(),
0624                                                                       digis_d->clus(),
0625                                                                       wordCounter);
0626
0627       cudaCheck(cudaGetLastError());
0628 #ifdef GPU_DEBUG
0629       cudaCheck(cudaStreamSynchronize(stream));
0630 #endif
0631
0632       // apply charge cut
0633       clusterChargeCut<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
0634                                                                               digis_d->moduleId(),
0635                                                                               digis_d->adc(),
0636                                                                               clusters_d->moduleStart(),
0637                                                                               clusters_d->clusInModule(),
0638                                                                               clusters_d->moduleId(),
0639                                                                               digis_d->clus(),
0640                                                                               wordCounter);
0641
0642       cudaCheck(cudaGetLastError());
0643
0644       // count the module start indices already here (instead of
0645       // rechits) so that the number of clusters/hits can be made
0646       // available in the rechit producer without additional points of
0647       // synchronization/ExternalWork
0648       auto nModules_Clusters_d = cms::cuda::make_device_unique<uint32_t[]>(3, stream);
0649       // MUST be ONE block
0650       fillHitsModuleStart<TrackerTraits><<<1, 1024, 0, stream>>>(clusters_d->clusInModule(),
0651                                                                  clusters_d->clusModuleStart(),
0652                                                                  clusters_d->moduleStart(),
0653                                                                  nModules_Clusters_d.get());
0654
0655       // copy to host
0656       nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(3, stream);
0657       cudaCheck(cudaMemcpyAsync(
0658           nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream));
0659
0660 #ifdef GPU_DEBUG
0661       cudaCheck(cudaStreamSynchronize(stream));
0662 #endif
0663
0664     }  // end clusterizer scope
0665   }
0666
0667   template <typename TrackerTraits>
0668   void SiPixelRawToClusterGPUKernel<TrackerTraits>::makePhase2ClustersAsync(
0669       const SiPixelClusterThresholds clusterThresholds,
0670       const uint16_t *moduleIds,
0671       const uint16_t *xDigis,
0672       const uint16_t *yDigis,
0673       const uint16_t *adcDigis,
0674       const uint32_t *packedData,
0675       const uint32_t *rawIds,
0676       const uint32_t numDigis,
0677       cudaStream_t stream) {
0678     using namespace gpuClustering;
0679     nDigis = numDigis;
0680     digis_d = SiPixelDigisCUDA(numDigis, stream);
0681
0682     cudaCheck(cudaMemcpyAsync(digis_d->moduleId(), moduleIds, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
0683     cudaCheck(cudaMemcpyAsync(digis_d->xx(), xDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
0684     cudaCheck(cudaMemcpyAsync(digis_d->yy(), yDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
0685     cudaCheck(cudaMemcpyAsync(digis_d->adc(), adcDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
0686     cudaCheck(cudaMemcpyAsync(digis_d->pdigi(), packedData, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream));
0687     cudaCheck(cudaMemcpyAsync(digis_d->rawIdArr(), rawIds, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream));
0688
0689     clusters_d = SiPixelClustersCUDA(TrackerTraits::numberOfModules, stream);
0690
0691     nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(2, stream);
0692
0693     int threadsPerBlock = 512;
0694     int blocks = (int(numDigis) + threadsPerBlock - 1) / threadsPerBlock;
0695
0696     gpuCalibPixel::calibDigisPhase2<<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
0697                                                                             digis_d->moduleId(),
0698                                                                             digis_d->adc(),
0699                                                                             numDigis,
0700                                                                             clusters_d->moduleStart(),
0701                                                                             clusters_d->clusInModule(),
0702                                                                             clusters_d->clusModuleStart());
0703
0704     cudaCheck(cudaGetLastError());
0705
0706 #ifdef GPU_DEBUG
0707     cudaCheck(cudaStreamSynchronize(stream));
0708     std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
0709 #endif
0710
0711     countModules<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(
0712         digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), numDigis);
0713     cudaCheck(cudaGetLastError());
0714
0715     // read the number of modules into a data member, used by getProduct())
0716     cudaCheck(cudaMemcpyAsync(
0717         &(nModules_Clusters_h[0]), clusters_d->moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream));
0718
0719     threadsPerBlock = 256;
0720     blocks = TrackerTraits::numberOfModules;
0721
0722 #ifdef GPU_DEBUG
0723     cudaCheck(cudaStreamSynchronize(stream));
0724     std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
0725 #endif
0726     findClus<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(digis_d->rawIdArr(),
0727                                                                     digis_d->moduleId(),
0728                                                                     digis_d->xx(),
0729                                                                     digis_d->yy(),
0730                                                                     clusters_d->moduleStart(),
0731                                                                     clusters_d->clusInModule(),
0732                                                                     clusters_d->moduleId(),
0733                                                                     digis_d->clus(),
0734                                                                     numDigis);
0735
0736     cudaCheck(cudaGetLastError());
0737 #ifdef GPU_DEBUG
0738     cudaCheck(cudaStreamSynchronize(stream));
0739     std::cout << "CUDA clusterChargeCut kernel launch with " << blocks << " blocks of " << threadsPerBlock
0740               << " threads\n";
0741 #endif
0742
0743     // apply charge cut
0744     clusterChargeCut<TrackerTraits><<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
0745                                                                             digis_d->moduleId(),
0746                                                                             digis_d->adc(),
0747                                                                             clusters_d->moduleStart(),
0748                                                                             clusters_d->clusInModule(),
0749                                                                             clusters_d->moduleId(),
0750                                                                             digis_d->clus(),
0751                                                                             numDigis);
0752     cudaCheck(cudaGetLastError());
0753
0754     auto nModules_Clusters_d = cms::cuda::make_device_unique<uint32_t[]>(3, stream);
0755
0756 #ifdef GPU_DEBUG
0757     cudaCheck(cudaStreamSynchronize(stream));
0758     std::cout << "CUDA fillHitsModuleStart kernel launch \n";
0759 #endif
0760
0761     // MUST be ONE block
0762     fillHitsModuleStart<TrackerTraits><<<1, 1024, 0, stream>>>(clusters_d->clusInModule(),
0763                                                                clusters_d->clusModuleStart(),
0764                                                                clusters_d->moduleStart(),
0765                                                                nModules_Clusters_d.get());
0766
0767     nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(3, stream);
0768     cudaCheck(cudaMemcpyAsync(
0769         nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream));
0770
0771 #ifdef GPU_DEBUG
0772     cudaCheck(cudaStreamSynchronize(stream));
0773 #endif
0774   }  //
0775
0776   template class SiPixelRawToClusterGPUKernel<pixelTopology::Phase1>;
0777   template class SiPixelRawToClusterGPUKernel<pixelTopology::Phase2>;
0778   template class SiPixelRawToClusterGPUKernel<pixelTopology::HIonPhase1>;
0779 }  // namespace pixelgpudetails