Back to home page

Project CMSSW displayed by LXR

 
 

    


Warning, /RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu is written in an unsupported language. File is not indexed.

0001 /* Sushil Dubey, Shashi Dugad, TIFR, July 2017
0002  *
0003  * File Name: RawToClusterGPU.cu
0004  * Description: It converts Raw data into Digi Format on GPU
0005  * Finaly the Output of RawToDigi data is given to pixelClusterizer
0006 **/
0007 
0008 // C++ includes
0009 #include <cassert>
0010 #include <cstdio>
0011 #include <cstdlib>
0012 #include <cstring>
0013 #include <fstream>
0014 #include <iomanip>
0015 #include <iostream>
0016 
0017 // CUDA includes
0018 #include <cuda_runtime.h>
0019 
0020 // CMSSW includes
0021 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
0022 #include "CondFormats/SiPixelObjects/interface/SiPixelROCsStatusAndMapping.h"
0023 #include "DataFormats/FEDRawData/interface/FEDNumbering.h"
0024 #include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
0025 #include "DataFormats/SiPixelDigi/interface/SiPixelDigiConstants.h"
0026 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0027 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0028 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
0029 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h"
0030 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h"
0031 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
0032 // local includes
0033 #include "SiPixelRawToClusterGPUKernel.h"
0034 
0035 namespace pixelgpudetails {
0036 
0037   __device__ bool isBarrel(uint32_t rawId) {
0038     return (PixelSubdetector::PixelBarrel == ((rawId >> DetId::kSubdetOffset) & DetId::kSubdetMask));
0039   }
0040 
0041   __device__ pixelgpudetails::DetIdGPU getRawId(const SiPixelROCsStatusAndMapping *cablingMap,
0042                                                 uint8_t fed,
0043                                                 uint32_t link,
0044                                                 uint32_t roc) {
0045     uint32_t index = fed * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc;
0046     pixelgpudetails::DetIdGPU detId = {
0047         cablingMap->rawId[index], cablingMap->rocInDet[index], cablingMap->moduleId[index]};
0048     return detId;
0049   }
0050 
0051   //reference http://cmsdoxygen.web.cern.ch/cmsdoxygen/CMSSW_9_2_0/doc/html/dd/d31/FrameConversion_8cc_source.html
0052   //http://cmslxr.fnal.gov/source/CondFormats/SiPixelObjects/src/PixelROC.cc?v=CMSSW_9_2_0#0071
0053   // Convert local pixel to pixelgpudetails::global pixel
0054   __device__ pixelgpudetails::Pixel frameConversion(
0055       bool bpix, int side, uint32_t layer, uint32_t rocIdInDetUnit, pixelgpudetails::Pixel local) {
0056     int slopeRow = 0, slopeCol = 0;
0057     int rowOffset = 0, colOffset = 0;
0058 
0059     if (bpix) {
0060       if (side == -1 && layer != 1) {  // -Z side: 4 non-flipped modules oriented like 'dddd', except Layer 1
0061         if (rocIdInDetUnit < 8) {
0062           slopeRow = 1;
0063           slopeCol = -1;
0064           rowOffset = 0;
0065           colOffset = (8 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
0066         } else {
0067           slopeRow = -1;
0068           slopeCol = 1;
0069           rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
0070           colOffset = (rocIdInDetUnit - 8) * pixelgpudetails::numColsInRoc;
0071         }       // if roc
0072       } else {  // +Z side: 4 non-flipped modules oriented like 'pppp', but all 8 in layer1
0073         if (rocIdInDetUnit < 8) {
0074           slopeRow = -1;
0075           slopeCol = 1;
0076           rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
0077           colOffset = rocIdInDetUnit * pixelgpudetails::numColsInRoc;
0078         } else {
0079           slopeRow = 1;
0080           slopeCol = -1;
0081           rowOffset = 0;
0082           colOffset = (16 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
0083         }
0084       }
0085 
0086     } else {             // fpix
0087       if (side == -1) {  // pannel 1
0088         if (rocIdInDetUnit < 8) {
0089           slopeRow = 1;
0090           slopeCol = -1;
0091           rowOffset = 0;
0092           colOffset = (8 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
0093         } else {
0094           slopeRow = -1;
0095           slopeCol = 1;
0096           rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
0097           colOffset = (rocIdInDetUnit - 8) * pixelgpudetails::numColsInRoc;
0098         }
0099       } else {  // pannel 2
0100         if (rocIdInDetUnit < 8) {
0101           slopeRow = 1;
0102           slopeCol = -1;
0103           rowOffset = 0;
0104           colOffset = (8 - rocIdInDetUnit) * pixelgpudetails::numColsInRoc - 1;
0105         } else {
0106           slopeRow = -1;
0107           slopeCol = 1;
0108           rowOffset = 2 * pixelgpudetails::numRowsInRoc - 1;
0109           colOffset = (rocIdInDetUnit - 8) * pixelgpudetails::numColsInRoc;
0110         }
0111 
0112       }  // side
0113     }
0114 
0115     uint32_t gRow = rowOffset + slopeRow * local.row;
0116     uint32_t gCol = colOffset + slopeCol * local.col;
0117     // inside frameConversion row: gRow, column: gCol
0118     pixelgpudetails::Pixel global = {gRow, gCol};
0119     return global;
0120   }
0121 
0122   // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc
0123   template <bool debug = false>
0124   __device__ uint8_t conversionError(uint8_t fedId, uint8_t status) {
0125     uint8_t errorType = 0;
0126 
0127     switch (status) {
0128       case (1): {
0129         if constexpr (debug)
0130           printf("Error in Fed: %i, invalid channel Id (errorType = 35\n)", fedId);
0131         errorType = 35;
0132         break;
0133       }
0134       case (2): {
0135         if constexpr (debug)
0136           printf("Error in Fed: %i, invalid ROC Id (errorType = 36)\n", fedId);
0137         errorType = 36;
0138         break;
0139       }
0140       case (3): {
0141         if constexpr (debug)
0142           printf("Error in Fed: %i, invalid dcol/pixel value (errorType = 37)\n", fedId);
0143         errorType = 37;
0144         break;
0145       }
0146       case (4): {
0147         if constexpr (debug)
0148           printf("Error in Fed: %i, dcol/pixel read out of order (errorType = 38)\n", fedId);
0149         errorType = 38;
0150         break;
0151       }
0152       default:
0153         if constexpr (debug)
0154           printf("Cabling check returned unexpected result, status = %i\n", status);
0155     };
0156 
0157     return errorType;
0158   }
0159 
0160   __device__ bool rocRowColIsValid(uint32_t rocRow, uint32_t rocCol) {
0161     /// row and column in ROC representation
0162     return ((rocRow < pixelgpudetails::numRowsInRoc) & (rocCol < pixelgpudetails::numColsInRoc));
0163   }
0164 
0165   __device__ bool dcolIsValid(uint32_t dcol, uint32_t pxid) { return ((dcol < 26) & (2 <= pxid) & (pxid < 162)); }
0166 
0167   // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc
0168   template <bool debug = false>
0169   __device__ uint8_t
0170   checkROC(uint32_t errorWord, uint8_t fedId, uint32_t link, const SiPixelROCsStatusAndMapping *cablingMap) {
0171     uint8_t errorType = (errorWord >> sipixelconstants::ROC_shift) & sipixelconstants::ERROR_mask;
0172     if (errorType < 25)
0173       return 0;
0174     bool errorFound = false;
0175 
0176     switch (errorType) {
0177       case (25): {
0178         errorFound = true;
0179         uint32_t index = fedId * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + 1;
0180         if (index > 1 && index <= cablingMap->size) {
0181           if (!(link == cablingMap->link[index] && 1 == cablingMap->roc[index]))
0182             errorFound = false;
0183         }
0184         if constexpr (debug)
0185           if (errorFound)
0186             printf("Invalid ROC = 25 found (errorType = 25)\n");
0187         break;
0188       }
0189       case (26): {
0190         if constexpr (debug)
0191           printf("Gap word found (errorType = 26)\n");
0192         errorFound = true;
0193         break;
0194       }
0195       case (27): {
0196         if constexpr (debug)
0197           printf("Dummy word found (errorType = 27)\n");
0198         errorFound = true;
0199         break;
0200       }
0201       case (28): {
0202         if constexpr (debug)
0203           printf("Error fifo nearly full (errorType = 28)\n");
0204         errorFound = true;
0205         break;
0206       }
0207       case (29): {
0208         if constexpr (debug)
0209           printf("Timeout on a channel (errorType = 29)\n");
0210         if ((errorWord >> sipixelconstants::OMIT_ERR_shift) & sipixelconstants::OMIT_ERR_mask) {
0211           if constexpr (debug)
0212             printf("...first errorType=29 error, this gets masked out\n");
0213         }
0214         errorFound = true;
0215         break;
0216       }
0217       case (30): {
0218         if constexpr (debug)
0219           printf("TBM error trailer (errorType = 30)\n");
0220         int stateMatch_bits = 4;
0221         int stateMatch_shift = 8;
0222         uint32_t stateMatch_mask = ~(~uint32_t(0) << stateMatch_bits);
0223         int stateMatch = (errorWord >> stateMatch_shift) & stateMatch_mask;
0224         if (stateMatch != 1 && stateMatch != 8) {
0225           if constexpr (debug)
0226             printf("FED error 30 with unexpected State Bits (errorType = 30)\n");
0227         }
0228         if (stateMatch == 1)
0229           errorType = 40;  // 1=Overflow -> 40, 8=number of ROCs -> 30
0230         errorFound = true;
0231         break;
0232       }
0233       case (31): {
0234         if constexpr (debug)
0235           printf("Event number error (errorType = 31)\n");
0236         errorFound = true;
0237         break;
0238       }
0239       default:
0240         errorFound = false;
0241     };
0242 
0243     return errorFound ? errorType : 0;
0244   }
0245 
0246   // error decoding and handling copied from EventFilter/SiPixelRawToDigi/src/ErrorChecker.cc
0247   template <bool debug = false>
0248   __device__ uint32_t
0249   getErrRawID(uint8_t fedId, uint32_t errWord, uint32_t errorType, const SiPixelROCsStatusAndMapping *cablingMap) {
0250     uint32_t rID = 0xffffffff;
0251 
0252     switch (errorType) {
0253       case 25:
0254       case 30:
0255       case 31:
0256       case 36:
0257       case 40: {
0258         uint32_t roc = 1;
0259         uint32_t link = sipixelconstants::getLink(errWord);
0260         uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
0261         if (rID_temp != gpuClustering::invalidModuleId)
0262           rID = rID_temp;
0263         break;
0264       }
0265       case 29: {
0266         int chanNmbr = 0;
0267         const int DB0_shift = 0;
0268         const int DB1_shift = DB0_shift + 1;
0269         const int DB2_shift = DB1_shift + 1;
0270         const int DB3_shift = DB2_shift + 1;
0271         const int DB4_shift = DB3_shift + 1;
0272         const uint32_t DataBit_mask = ~(~uint32_t(0) << 1);
0273 
0274         int CH1 = (errWord >> DB0_shift) & DataBit_mask;
0275         int CH2 = (errWord >> DB1_shift) & DataBit_mask;
0276         int CH3 = (errWord >> DB2_shift) & DataBit_mask;
0277         int CH4 = (errWord >> DB3_shift) & DataBit_mask;
0278         int CH5 = (errWord >> DB4_shift) & DataBit_mask;
0279         int BLOCK_bits = 3;
0280         int BLOCK_shift = 8;
0281         uint32_t BLOCK_mask = ~(~uint32_t(0) << BLOCK_bits);
0282         int BLOCK = (errWord >> BLOCK_shift) & BLOCK_mask;
0283         int localCH = 1 * CH1 + 2 * CH2 + 3 * CH3 + 4 * CH4 + 5 * CH5;
0284         if (BLOCK % 2 == 0)
0285           chanNmbr = (BLOCK / 2) * 9 + localCH;
0286         else
0287           chanNmbr = ((BLOCK - 1) / 2) * 9 + 4 + localCH;
0288         if ((chanNmbr < 1) || (chanNmbr > 36))
0289           break;  // signifies unexpected result
0290 
0291         uint32_t roc = 1;
0292         uint32_t link = chanNmbr;
0293         uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
0294         if (rID_temp != gpuClustering::invalidModuleId)
0295           rID = rID_temp;
0296         break;
0297       }
0298       case 37:
0299       case 38: {
0300         uint32_t roc = sipixelconstants::getROC(errWord);
0301         uint32_t link = sipixelconstants::getLink(errWord);
0302         uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId;
0303         if (rID_temp != gpuClustering::invalidModuleId)
0304           rID = rID_temp;
0305         break;
0306       }
0307       default:
0308         break;
0309     };
0310 
0311     return rID;
0312   }
0313 
0314   // Kernel to perform Raw to Digi conversion
0315   template <bool debug = false>
0316   __global__ void RawToDigi_kernel(const SiPixelROCsStatusAndMapping *cablingMap,
0317                                    const unsigned char *modToUnp,
0318                                    const uint32_t wordCounter,
0319                                    const uint32_t *word,
0320                                    const uint8_t *fedIds,
0321                                    SiPixelDigisCUDASOAView digisView,
0322                                    cms::cuda::SimpleVector<SiPixelErrorCompact> *err,
0323                                    bool useQualityInfo,
0324                                    bool includeErrors) {
0325     //if (threadIdx.x==0) printf("Event: %u blockIdx.x: %u start: %u end: %u\n", eventno, blockIdx.x, begin, end);
0326 
0327     int32_t first = threadIdx.x + blockIdx.x * blockDim.x;
0328     for (int32_t iloop = first, nend = wordCounter; iloop < nend; iloop += blockDim.x * gridDim.x) {
0329       auto gIndex = iloop;
0330       auto dvgi = digisView[gIndex];
0331       dvgi.xx() = 0;
0332       dvgi.yy() = 0;
0333       dvgi.adc() = 0;
0334       bool skipROC = false;
0335 
0336       uint8_t fedId = fedIds[gIndex / 2];  // +1200;
0337 
0338       // initialize (too many coninue below)
0339       dvgi.pdigi() = 0;
0340       dvgi.rawIdArr() = 0;
0341       dvgi.moduleId() = gpuClustering::invalidModuleId;
0342 
0343       uint32_t ww = word[gIndex];  // Array containing 32 bit raw data
0344       if (ww == 0) {
0345         // 0 is an indicator of a noise/dead channel, skip these pixels during clusterization
0346         continue;
0347       }
0348 
0349       uint32_t link = sipixelconstants::getLink(ww);  // Extract link
0350       uint32_t roc = sipixelconstants::getROC(ww);    // Extract ROC in link
0351 
0352       uint8_t errorType = checkROC<debug>(ww, fedId, link, cablingMap);
0353       skipROC = (roc < pixelgpudetails::maxROCIndex) ? false : (errorType != 0);
0354       if (includeErrors and skipROC) {
0355         uint32_t rID = getErrRawID<debug>(fedId, ww, errorType, cablingMap);
0356         err->push_back(SiPixelErrorCompact{rID, ww, errorType, fedId});
0357         continue;
0358       }
0359 
0360       // check for spurious channels
0361       if (roc > MAX_ROC or link > MAX_LINK) {
0362         if constexpr (debug) {
0363           printf("spurious roc %d found on link %d, detector %d (index %d)\n",
0364                  roc,
0365                  link,
0366                  getRawId(cablingMap, fedId, link, 1).rawId,
0367                  gIndex);
0368         }
0369         continue;
0370       }
0371 
0372       uint32_t index = fedId * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc;
0373       if (useQualityInfo) {
0374         skipROC = cablingMap->badRocs[index];
0375         if (skipROC)
0376           continue;
0377       }
0378       skipROC = modToUnp[index];
0379       if (skipROC)
0380         continue;
0381 
0382       pixelgpudetails::DetIdGPU detId = getRawId(cablingMap, fedId, link, roc);
0383       uint32_t rawId = detId.rawId;
0384       uint32_t layer = 0;
0385       int side = 0, panel = 0, module = 0;
0386       bool barrel = isBarrel(rawId);
0387       if (barrel) {
0388         layer = (rawId >> pixelgpudetails::layerStartBit) & pixelgpudetails::layerMask;
0389         module = (rawId >> pixelgpudetails::moduleStartBit) & pixelgpudetails::moduleMask;
0390         side = (module < 5) ? -1 : 1;
0391       } else {
0392         // endcap ids
0393         layer = 0;
0394         panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask;
0395         side = (panel == 1) ? -1 : 1;
0396       }
0397 
0398       // ***special case of layer to 1 be handled here
0399       pixelgpudetails::Pixel localPix;
0400       if (layer == 1) {
0401         uint32_t col = sipixelconstants::getCol(ww);
0402         uint32_t row = sipixelconstants::getRow(ww);
0403         localPix.row = row;
0404         localPix.col = col;
0405         if (includeErrors) {
0406           if (not rocRowColIsValid(row, col)) {
0407             uint8_t error = conversionError<debug>(fedId, 3);  //use the device function and fill the arrays
0408             err->push_back(SiPixelErrorCompact{rawId, ww, error, fedId});
0409             if constexpr (debug)
0410               printf("BPIX1  Error status: %i\n", error);
0411             continue;
0412           }
0413         }
0414       } else {
0415         // ***conversion rules for dcol and pxid
0416         uint32_t dcol = sipixelconstants::getDCol(ww);
0417         uint32_t pxid = sipixelconstants::getPxId(ww);
0418         uint32_t row = pixelgpudetails::numRowsInRoc - pxid / 2;
0419         uint32_t col = dcol * 2 + pxid % 2;
0420         localPix.row = row;
0421         localPix.col = col;
0422         if (includeErrors and not dcolIsValid(dcol, pxid)) {
0423           uint8_t error = conversionError<debug>(fedId, 3);
0424           err->push_back(SiPixelErrorCompact{rawId, ww, error, fedId});
0425           if constexpr (debug)
0426             printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc);
0427           continue;
0428         }
0429       }
0430 
0431       pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, detId.rocInDet, localPix);
0432       dvgi.xx() = globalPix.row;  // origin shifting by 1 0-159
0433       dvgi.yy() = globalPix.col;  // origin shifting by 1 0-415
0434       dvgi.adc() = sipixelconstants::getADC(ww);
0435       dvgi.pdigi() = pixelgpudetails::pack(globalPix.row, globalPix.col, dvgi.adc());
0436       dvgi.moduleId() = detId.moduleId;
0437       dvgi.rawIdArr() = rawId;
0438     }  // end of loop (gIndex < end)
0439 
0440   }  // end of Raw to Digi kernel
0441 
0442   template <typename TrackerTraits>
0443   __global__ void fillHitsModuleStart(uint32_t const *__restrict__ clusInModule,
0444                                       uint32_t *__restrict__ moduleStart,
0445                                       uint32_t const *__restrict__ nModules,
0446                                       uint32_t *__restrict__ nModules_Clusters) {
0447     constexpr int nMaxModules = TrackerTraits::numberOfModules;
0448     constexpr int startBPIX2 = TrackerTraits::layerStart[1];
0449 
0450     assert(startBPIX2 < nMaxModules);
0451     assert(nMaxModules < 4096);  // easy to extend at least till 32*1024
0452     assert(nMaxModules > 1024);
0453 
0454     assert(1 == gridDim.x);
0455     assert(0 == blockIdx.x);
0456 
0457     int first = threadIdx.x;
0458 
0459     // limit to MaxHitsInModule;
0460     for (int i = first, iend = nMaxModules; i < iend; i += blockDim.x) {
0461       moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), clusInModule[i]);
0462     }
0463 
0464     constexpr bool isPhase2 = std::is_base_of<pixelTopology::Phase2, TrackerTraits>::value;
0465     __shared__ uint32_t ws[32];
0466     cms::cuda::blockPrefixScan(moduleStart + 1, moduleStart + 1, 1024, ws);
0467     constexpr int lastModules = isPhase2 ? 1024 : nMaxModules - 1024;
0468     cms::cuda::blockPrefixScan(moduleStart + 1024 + 1, moduleStart + 1024 + 1, lastModules, ws);
0469 
0470     if constexpr (isPhase2) {
0471       cms::cuda::blockPrefixScan(moduleStart + 2048 + 1, moduleStart + 2048 + 1, 1024, ws);
0472       cms::cuda::blockPrefixScan(moduleStart + 3072 + 1, moduleStart + 3072 + 1, nMaxModules - 3072, ws);
0473     }
0474 
0475     for (int i = first + 1025, iend = isPhase2 ? 2049 : nMaxModules + 1; i < iend; i += blockDim.x) {
0476       moduleStart[i] += moduleStart[1024];
0477     }
0478     __syncthreads();
0479 
0480     if constexpr (isPhase2) {
0481       for (int i = first + 2049, iend = 3073; i < iend; i += blockDim.x) {
0482         moduleStart[i] += moduleStart[2048];
0483       }
0484       __syncthreads();
0485       for (int i = first + 3073, iend = nMaxModules + 1; i < iend; i += blockDim.x) {
0486         moduleStart[i] += moduleStart[3072];
0487       }
0488       __syncthreads();
0489     }
0490 
0491     if (threadIdx.x == 0) {
0492       // copy the number of modules
0493       nModules_Clusters[0] = *nModules;
0494       // last element holds the number of all clusters
0495       nModules_Clusters[1] = moduleStart[nMaxModules];
0496       // element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1)
0497       nModules_Clusters[2] = moduleStart[startBPIX2];
0498     }
0499 
0500 #ifdef GPU_DEBUG
0501     uint16_t maxH = isPhase2 ? 3027 : 1024;
0502     assert(0 == moduleStart[0]);
0503     auto c0 = std::min(gpuClustering::maxHitsInModule(), clusInModule[0]);
0504     assert(c0 == moduleStart[1]);
0505     assert(moduleStart[maxH] >= moduleStart[maxH - 1]);
0506     assert(moduleStart[maxH + 1] >= moduleStart[maxH]);
0507     assert(moduleStart[nMaxModules] >= moduleStart[maxH + 1]);
0508 
0509     constexpr int startFP1 = TrackerTraits::numberOfModulesInBarrel;
0510     constexpr int startLastFwd = TrackerTraits::layerStart[TrackerTraits::numberOfLayers];
0511     for (int i = first, iend = nMaxModules + 1; i < iend; i += blockDim.x) {
0512       if (0 != i)
0513         assert(moduleStart[i] >= moduleStart[i - i]);
0514       // [BPX1, BPX2, BPX3, BPX4,  FP1,  FP2,  FP3,  FN1,  FN2,  FN3, LAST_VALID]
0515       // [   0,   96,  320,  672, 1184, 1296, 1408, 1520, 1632, 1744,       1856]
0516       if (i == startBPIX2 || i == startFP1 || i == startLastFwd || i == nMaxModules)
0517         printf("moduleStart %d %d\n", i, moduleStart[i]);
0518     }
0519 
0520 #endif
0521   }
0522 
0523   // Interface to outside
0524   void SiPixelRawToClusterGPUKernel::makeClustersAsync(bool isRun2,
0525                                                        const SiPixelClusterThresholds clusterThresholds,
0526                                                        const SiPixelROCsStatusAndMapping *cablingMap,
0527                                                        const unsigned char *modToUnp,
0528                                                        const SiPixelGainForHLTonGPU *gains,
0529                                                        const WordFedAppender &wordFed,
0530                                                        SiPixelFormatterErrors &&errors,
0531                                                        const uint32_t wordCounter,
0532                                                        const uint32_t fedCounter,
0533                                                        bool useQualityInfo,
0534                                                        bool includeErrors,
0535                                                        bool debug,
0536                                                        cudaStream_t stream) {
0537     using pixelTopology::Phase1;
0538     // we're not opting for calling this function in case of early events
0539     assert(wordCounter != 0);
0540     nDigis = wordCounter;
0541 
0542 #ifdef GPU_DEBUG
0543     std::cout << "decoding " << wordCounter << " digis." << std::endl;
0544 #endif
0545 
0546     // since wordCounter != 0 we're not allocating 0 bytes,
0547     // digis_d = SiPixelDigisCUDA(wordCounter, stream);
0548     digis_d = SiPixelDigisCUDA(size_t(wordCounter), stream);
0549     if (includeErrors) {
0550       digiErrors_d = SiPixelDigiErrorsCUDA(wordCounter, std::move(errors), stream);
0551     }
0552     clusters_d = SiPixelClustersCUDA(Phase1::numberOfModules, stream);
0553 
0554     // Begin Raw2Digi block
0555     {
0556       const int threadsPerBlock = 512;
0557       const int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock;  // fill it all
0558 
0559       assert(0 == wordCounter % 2);
0560       // wordCounter is the total no of words in each event to be trasfered on device
0561       auto word_d = cms::cuda::make_device_unique<uint32_t[]>(wordCounter, stream);
0562       auto fedId_d = cms::cuda::make_device_unique<uint8_t[]>(wordCounter, stream);
0563 
0564       cudaCheck(
0565           cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter * sizeof(uint32_t), cudaMemcpyDefault, stream));
0566       cudaCheck(cudaMemcpyAsync(
0567           fedId_d.get(), wordFed.fedId(), wordCounter * sizeof(uint8_t) / 2, cudaMemcpyDefault, stream));
0568 
0569       // Launch rawToDigi kernel
0570       if (debug)
0571         RawToDigi_kernel<true><<<blocks, threadsPerBlock, 0, stream>>>(  //
0572             cablingMap,
0573             modToUnp,
0574             wordCounter,
0575             word_d.get(),
0576             fedId_d.get(),
0577             digis_d.view(),
0578             digiErrors_d.error(),  // returns nullptr if default-constructed
0579             useQualityInfo,
0580             includeErrors);
0581       else
0582         RawToDigi_kernel<false><<<blocks, threadsPerBlock, 0, stream>>>(  //
0583             cablingMap,
0584             modToUnp,
0585             wordCounter,
0586             word_d.get(),
0587             fedId_d.get(),
0588             digis_d.view(),
0589             digiErrors_d.error(),  // returns nullptr if default-constructed
0590             useQualityInfo,
0591             includeErrors);
0592       cudaCheck(cudaGetLastError());
0593 #ifdef GPU_DEBUG
0594       cudaCheck(cudaStreamSynchronize(stream));
0595 #endif
0596 
0597       if (includeErrors) {
0598         digiErrors_d.copyErrorToHostAsync(stream);
0599       }
0600     }
0601     // End of Raw2Digi and passing data for clustering
0602 
0603     {
0604       // clusterizer ...
0605       using namespace gpuClustering;
0606       int threadsPerBlock = 256;
0607       int blocks = (std::max(int(wordCounter), int(Phase1::numberOfModules)) + threadsPerBlock - 1) / threadsPerBlock;
0608 
0609       if (isRun2)
0610         gpuCalibPixel::calibDigis<true><<<blocks, threadsPerBlock, 0, stream>>>(digis_d->moduleId(),
0611                                                                                 digis_d->xx(),
0612                                                                                 digis_d->yy(),
0613                                                                                 digis_d->adc(),
0614                                                                                 gains,
0615                                                                                 wordCounter,
0616                                                                                 clusters_d->moduleStart(),
0617                                                                                 clusters_d->clusInModule(),
0618                                                                                 clusters_d->clusModuleStart());
0619       else
0620         gpuCalibPixel::calibDigis<false><<<blocks, threadsPerBlock, 0, stream>>>(digis_d->moduleId(),
0621                                                                                  digis_d->xx(),
0622                                                                                  digis_d->yy(),
0623                                                                                  digis_d->adc(),
0624                                                                                  gains,
0625                                                                                  wordCounter,
0626                                                                                  clusters_d->moduleStart(),
0627                                                                                  clusters_d->clusInModule(),
0628                                                                                  clusters_d->clusModuleStart());
0629 
0630       cudaCheck(cudaGetLastError());
0631 #ifdef GPU_DEBUG
0632       cudaCheck(cudaStreamSynchronize(stream));
0633 #endif
0634 
0635 #ifdef GPU_DEBUG
0636       std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock
0637                 << " threads\n";
0638 #endif
0639 
0640       countModules<Phase1><<<blocks, threadsPerBlock, 0, stream>>>(
0641           digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), wordCounter);
0642       cudaCheck(cudaGetLastError());
0643 
0644       threadsPerBlock = 256 + 128;  /// should be larger than 6000/16 aka (maxPixInModule/maxiter in the kernel)
0645       blocks = phase1PixelTopology::numberOfModules;
0646 #ifdef GPU_DEBUG
0647       std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
0648 #endif
0649 
0650       findClus<Phase1><<<blocks, threadsPerBlock, 0, stream>>>(digis_d->rawIdArr(),
0651                                                                digis_d->moduleId(),
0652                                                                digis_d->xx(),
0653                                                                digis_d->yy(),
0654                                                                clusters_d->moduleStart(),
0655                                                                clusters_d->clusInModule(),
0656                                                                clusters_d->moduleId(),
0657                                                                digis_d->clus(),
0658                                                                wordCounter);
0659 
0660       cudaCheck(cudaGetLastError());
0661 #ifdef GPU_DEBUG
0662       cudaCheck(cudaStreamSynchronize(stream));
0663 #endif
0664 
0665       // apply charge cut
0666       clusterChargeCut<Phase1><<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
0667                                                                        digis_d->moduleId(),
0668                                                                        digis_d->adc(),
0669                                                                        clusters_d->moduleStart(),
0670                                                                        clusters_d->clusInModule(),
0671                                                                        clusters_d->moduleId(),
0672                                                                        digis_d->clus(),
0673                                                                        wordCounter);
0674 
0675       cudaCheck(cudaGetLastError());
0676 
0677       // count the module start indices already here (instead of
0678       // rechits) so that the number of clusters/hits can be made
0679       // available in the rechit producer without additional points of
0680       // synchronization/ExternalWork
0681       auto nModules_Clusters_d = cms::cuda::make_device_unique<uint32_t[]>(3, stream);
0682       // MUST be ONE block
0683       fillHitsModuleStart<Phase1><<<1, 1024, 0, stream>>>(clusters_d->clusInModule(),
0684                                                           clusters_d->clusModuleStart(),
0685                                                           clusters_d->moduleStart(),
0686                                                           nModules_Clusters_d.get());
0687 
0688       // copy to host
0689       nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(3, stream);
0690       cudaCheck(cudaMemcpyAsync(
0691           nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream));
0692 
0693 #ifdef GPU_DEBUG
0694       cudaCheck(cudaStreamSynchronize(stream));
0695 #endif
0696 
0697     }  // end clusterizer scope
0698   }
0699 
0700   void SiPixelRawToClusterGPUKernel::makePhase2ClustersAsync(const SiPixelClusterThresholds clusterThresholds,
0701                                                              const uint16_t *moduleIds,
0702                                                              const uint16_t *xDigis,
0703                                                              const uint16_t *yDigis,
0704                                                              const uint16_t *adcDigis,
0705                                                              const uint32_t *packedData,
0706                                                              const uint32_t *rawIds,
0707                                                              const uint32_t numDigis,
0708                                                              cudaStream_t stream) {
0709     using namespace gpuClustering;
0710     using pixelTopology::Phase2;
0711     nDigis = numDigis;
0712     digis_d = SiPixelDigisCUDA(numDigis, stream);
0713 
0714     cudaCheck(cudaMemcpyAsync(digis_d->moduleId(), moduleIds, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
0715     cudaCheck(cudaMemcpyAsync(digis_d->xx(), xDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
0716     cudaCheck(cudaMemcpyAsync(digis_d->yy(), yDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
0717     cudaCheck(cudaMemcpyAsync(digis_d->adc(), adcDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream));
0718     cudaCheck(cudaMemcpyAsync(digis_d->pdigi(), packedData, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream));
0719     cudaCheck(cudaMemcpyAsync(digis_d->rawIdArr(), rawIds, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream));
0720 
0721     clusters_d = SiPixelClustersCUDA(Phase2::numberOfModules, stream);
0722 
0723     nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(2, stream);
0724 
0725     int threadsPerBlock = 512;
0726     int blocks = (int(numDigis) + threadsPerBlock - 1) / threadsPerBlock;
0727 
0728     gpuCalibPixel::calibDigisPhase2<<<blocks, threadsPerBlock, 0, stream>>>(digis_d->moduleId(),
0729                                                                             digis_d->adc(),
0730                                                                             numDigis,
0731                                                                             clusters_d->moduleStart(),
0732                                                                             clusters_d->clusInModule(),
0733                                                                             clusters_d->clusModuleStart());
0734 
0735     cudaCheck(cudaGetLastError());
0736 
0737 #ifdef GPU_DEBUG
0738     cudaCheck(cudaStreamSynchronize(stream));
0739     std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
0740 #endif
0741 
0742     countModules<Phase2><<<blocks, threadsPerBlock, 0, stream>>>(
0743         digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), numDigis);
0744     cudaCheck(cudaGetLastError());
0745 
0746     // read the number of modules into a data member, used by getProduct())
0747     cudaCheck(cudaMemcpyAsync(
0748         &(nModules_Clusters_h[0]), clusters_d->moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream));
0749 
0750     threadsPerBlock = 256;
0751     blocks = Phase2::numberOfModules;
0752 
0753 #ifdef GPU_DEBUG
0754     cudaCheck(cudaStreamSynchronize(stream));
0755     std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
0756 #endif
0757     findClus<Phase2><<<blocks, threadsPerBlock, 0, stream>>>(digis_d->rawIdArr(),
0758                                                              digis_d->moduleId(),
0759                                                              digis_d->xx(),
0760                                                              digis_d->yy(),
0761                                                              clusters_d->moduleStart(),
0762                                                              clusters_d->clusInModule(),
0763                                                              clusters_d->moduleId(),
0764                                                              digis_d->clus(),
0765                                                              numDigis);
0766 
0767     cudaCheck(cudaGetLastError());
0768 #ifdef GPU_DEBUG
0769     cudaCheck(cudaStreamSynchronize(stream));
0770     std::cout << "CUDA clusterChargeCut kernel launch with " << blocks << " blocks of " << threadsPerBlock
0771               << " threads\n";
0772 #endif
0773 
0774     // apply charge cut
0775     clusterChargeCut<Phase2><<<blocks, threadsPerBlock, 0, stream>>>(clusterThresholds,
0776                                                                      digis_d->moduleId(),
0777                                                                      digis_d->adc(),
0778                                                                      clusters_d->moduleStart(),
0779                                                                      clusters_d->clusInModule(),
0780                                                                      clusters_d->moduleId(),
0781                                                                      digis_d->clus(),
0782                                                                      numDigis);
0783     cudaCheck(cudaGetLastError());
0784 
0785     auto nModules_Clusters_d = cms::cuda::make_device_unique<uint32_t[]>(3, stream);
0786     // MUST be ONE block
0787 
0788 #ifdef GPU_DEBUG
0789     cudaCheck(cudaStreamSynchronize(stream));
0790     std::cout << "CUDA fillHitsModuleStart kernel launch \n";
0791 #endif
0792 
0793     fillHitsModuleStart<Phase2><<<1, 1024, 0, stream>>>(clusters_d->clusInModule(),
0794                                                         clusters_d->clusModuleStart(),
0795                                                         clusters_d->moduleStart(),
0796                                                         nModules_Clusters_d.get());
0797 
0798     nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(3, stream);
0799     cudaCheck(cudaMemcpyAsync(
0800         nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream));
0801 
0802 #ifdef GPU_DEBUG
0803     cudaCheck(cudaStreamSynchronize(stream));
0804 #endif
0805   }  //
0806 }  // namespace pixelgpudetails