SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cu

0001 //#define GPU_DEBUG
0002 #if defined(EDM_ML_DEBUG) || defined(GPU_DEBUG)
0003 #define GPU_CHECK
0004 #include <cstdio>
0005 #endif
0006
0007 #include <cub/cub.cuh>
0008 // prevent _Float16 defined by CUDA headers from hiding the ISO C type used by GCC
0009 #ifdef _Float16
0010 #undef _Float16
0011 #endif
0012
0013 #include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
0014 #include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
0015 #include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h"
0017 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
0018 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
0019 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0020 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
0021
0022 #include "ChannelLocsGPU.h"
0023 #include "SiStripRawToClusterGPUKernel.h"
0024 #include "StripDataView.h"
0025
0026 using namespace stripgpu;
0027 using ConditionsDeviceView = SiStripClusterizerConditionsGPU::Data::DeviceView;
0028
0029 __global__ static void unpackChannels(const ChannelLocsView *chanlocs,
0030                                       const ConditionsDeviceView *conditions,
0031                                       uint8_t *alldata,
0032                                       uint16_t *channel,
0033                                       stripId_t *stripId) {
0034   const int tid = threadIdx.x;
0035   const int bid = blockIdx.x;
0036   const int nthreads = blockDim.x;
0037
0038   const auto first = nthreads * bid + tid;
0039   const auto stride = blockDim.x * gridDim.x;
0040   for (auto chan = first; chan < chanlocs->size(); chan += stride) {
0041     const auto fedid = chanlocs->fedID(chan);
0042     const auto fedch = chanlocs->fedCh(chan);
0043     const auto ipair = conditions->iPair(fedid, fedch);
0044     const auto ipoff = sistrip::STRIPS_PER_FEDCH * ipair;
0045
0046     const auto data = chanlocs->input(chan);
0047     const auto len = chanlocs->length(chan);
0048
0049     if (data != nullptr && len > 0) {
0050       auto aoff = chanlocs->offset(chan);
0051       auto choff = chanlocs->inoff(chan);
0052       const auto end = choff + len;
0053
0054       while (choff < end) {
0055         auto stripIndex = data[(choff++) ^ 7] + ipoff;
0056         const auto groupLength = data[(choff++) ^ 7];
0057
0058         for (auto i = 0; i < 2; ++i) {
0059           stripId[aoff] = invalidStrip;
0060           alldata[aoff++] = 0;
0061         }
0062
0063         for (auto i = 0; i < groupLength; ++i) {
0064           stripId[aoff] = stripIndex++;
0065           channel[aoff] = chan;
0066           alldata[aoff++] = data[(choff++) ^ 7];
0067         }
0068       }
0069     }  // choff < end
0070   }  // data != nullptr && len > 0
0071 }  // chan < chanlocs->size()
0072
0073 __global__ static void setSeedStripsGPU(StripDataView *sst_data_d, const ConditionsDeviceView *conditions) {
0074   const int nStrips = sst_data_d->nStrips;
0075   const auto __restrict__ chanlocs = sst_data_d->chanlocs;
0076   const uint8_t *__restrict__ adc = sst_data_d->adc;
0077   const uint16_t *__restrict__ channels = sst_data_d->channel;
0078   const uint16_t *__restrict__ stripId = sst_data_d->stripId;
0079   int *__restrict__ seedStripsMask = sst_data_d->seedStripsMask;
0080   int *__restrict__ seedStripsNCMask = sst_data_d->seedStripsNCMask;
0081   const float seedThreshold = sst_data_d->seedThreshold;
0082
0083   const int tid = threadIdx.x;
0084   const int bid = blockIdx.x;
0085   const int nthreads = blockDim.x;
0086   const int first = nthreads * bid + tid;
0087   const int stride = blockDim.x * gridDim.x;
0088
0089   for (int i = first; i < nStrips; i += stride) {
0090     seedStripsMask[i] = 0;
0091     seedStripsNCMask[i] = 0;
0092     const stripId_t strip = stripId[i];
0093     if (strip != invalidStrip) {
0094       const auto chan = channels[i];
0095       const fedId_t fed = chanlocs->fedID(chan);
0096       const fedCh_t channel = chanlocs->fedCh(chan);
0097       const float noise_i = conditions->noise(fed, channel, strip);
0098       const uint8_t adc_i = adc[i];
0099
0100       seedStripsMask[i] = (adc_i >= static_cast<uint8_t>(noise_i * seedThreshold)) ? 1 : 0;
0101       seedStripsNCMask[i] = seedStripsMask[i];
0102     }
0103   }
0104 }
0105
0106 __global__ static void setNCSeedStripsGPU(StripDataView *sst_data_d, const ConditionsDeviceView *conditions) {
0107   const int nStrips = sst_data_d->nStrips;
0108   const auto __restrict__ chanlocs = sst_data_d->chanlocs;
0109   const uint16_t *__restrict__ channels = sst_data_d->channel;
0110   const uint16_t *__restrict__ stripId = sst_data_d->stripId;
0111   const int *__restrict__ seedStripsMask = sst_data_d->seedStripsMask;
0112   int *__restrict__ seedStripsNCMask = sst_data_d->seedStripsNCMask;
0113
0114   const int tid = threadIdx.x;
0115   const int bid = blockIdx.x;
0116   const int nthreads = blockDim.x;
0117   const int first = nthreads * bid + tid;
0118   const int stride = blockDim.x * gridDim.x;
0119
0120   for (int i = first; i < nStrips; i += stride) {
0121     if (i > 0) {
0122       const auto detid = chanlocs->detID(channels[i]);
0123       const auto detid1 = chanlocs->detID(channels[i - 1]);
0124
0125       if (seedStripsMask[i] && seedStripsMask[i - 1] && (stripId[i] - stripId[i - 1]) == 1 && (detid == detid1))
0126         seedStripsNCMask[i] = 0;
0127     }
0128   }
0129 }
0130
0131 __global__ static void setStripIndexGPU(StripDataView *sst_data_d) {
0132   const int nStrips = sst_data_d->nStrips;
0133   const int *__restrict__ seedStripsNCMask = sst_data_d->seedStripsNCMask;
0134   const int *__restrict__ prefixSeedStripsNCMask = sst_data_d->prefixSeedStripsNCMask;
0135   int *__restrict__ seedStripsNCIndex = sst_data_d->seedStripsNCIndex;
0136
0137   const int tid = threadIdx.x;
0138   const int bid = blockIdx.x;
0139   const int nthreads = blockDim.x;
0140   const int first = nthreads * bid + tid;
0141   const int stride = blockDim.x * gridDim.x;
0142
0143   for (int i = first; i < nStrips; i += stride) {
0144     if (seedStripsNCMask[i] == 1) {
0145       const int index = prefixSeedStripsNCMask[i];
0146       seedStripsNCIndex[index] = i;
0147     }
0148   }
0149 }
0150
0151 __global__ static void findLeftRightBoundaryGPU(const StripDataView *sst_data_d,
0152                                                 const ConditionsDeviceView *conditions,
0153                                                 SiStripClustersCUDADevice::DeviceView *clust_data_d) {
0154   const int nStrips = sst_data_d->nStrips;
0155   const int *__restrict__ seedStripsNCIndex = sst_data_d->seedStripsNCIndex;
0156   const auto __restrict__ chanlocs = sst_data_d->chanlocs;
0157   const uint16_t *__restrict__ stripId = sst_data_d->stripId;
0158   const uint16_t *__restrict__ channels = sst_data_d->channel;
0159   const uint8_t *__restrict__ adc = sst_data_d->adc;
0160   const int nSeedStripsNC = std::min(kMaxSeedStrips, *(sst_data_d->prefixSeedStripsNCMask + nStrips - 1));
0161   const uint8_t maxSequentialHoles = sst_data_d->maxSequentialHoles;
0162   const float channelThreshold = sst_data_d->channelThreshold;
0163   const float clusterThresholdSquared = sst_data_d->clusterThresholdSquared;
0164   const int clusterSizeLimit = sst_data_d->clusterSizeLimit;
0165
0166   auto __restrict__ clusterIndexLeft = clust_data_d->clusterIndex_;
0167   auto __restrict__ clusterSize = clust_data_d->clusterSize_;
0168   auto __restrict__ clusterDetId = clust_data_d->clusterDetId_;
0169   auto __restrict__ firstStrip = clust_data_d->firstStrip_;
0170   auto __restrict__ trueCluster = clust_data_d->trueCluster_;
0171
0172   const int tid = threadIdx.x;
0173   const int bid = blockIdx.x;
0174   const int nthreads = blockDim.x;
0175   const int first = nthreads * bid + tid;
0176   const int stride = blockDim.x * gridDim.x;
0177
0178   for (int i = first; i < nSeedStripsNC; i += stride) {
0179     const auto index = seedStripsNCIndex[i];
0180     const auto chan = channels[index];
0181     const auto fed = chanlocs->fedID(chan);
0182     const auto channel = chanlocs->fedCh(chan);
0183     const auto det = chanlocs->detID(chan);
0184     const auto strip = stripId[index];
0185     const auto noise_i = conditions->noise(fed, channel, strip);
0186
0187     auto noiseSquared_i = noise_i * noise_i;
0188     float adcSum_i = static_cast<float>(adc[index]);
0189     auto testIndex = index - 1;
0190     auto size = 1;
0191
0192     auto addtocluster = [&](int &indexLR) {
0193       const auto testchan = channels[testIndex];
0194       const auto testFed = chanlocs->fedID(testchan);
0195       const auto testChannel = chanlocs->fedCh(testchan);
0196       const auto testStrip = stripId[testIndex];
0197       const auto testNoise = conditions->noise(testFed, testChannel, testStrip);
0198       const auto testADC = adc[testIndex];
0199
0200       if (testADC >= static_cast<uint8_t>(testNoise * channelThreshold)) {
0201         ++size;
0202         indexLR = testIndex;
0203         noiseSquared_i += testNoise * testNoise;
0204         adcSum_i += static_cast<float>(testADC);
0205       }
0206     };
0207
0208     // find left boundary
0209     auto indexLeft = index;
0210
0211     if (testIndex >= 0 && stripId[testIndex] == invalidStrip) {
0212       testIndex -= 2;
0213     }
0214
0215     if (testIndex >= 0) {
0216       const auto testchan = channels[testIndex];
0217       const auto testDet = chanlocs->detID(testchan);
0218       auto rangeLeft = stripId[indexLeft] - stripId[testIndex] - 1;
0219       auto sameDetLeft = det == testDet;
0220
0221       while (sameDetLeft && rangeLeft >= 0 && rangeLeft <= maxSequentialHoles && size < clusterSizeLimit + 1) {
0222         addtocluster(indexLeft);
0223         --testIndex;
0224         if (testIndex >= 0 && stripId[testIndex] == invalidStrip) {
0225           testIndex -= 2;
0226         }
0227         if (testIndex >= 0) {
0228           rangeLeft = stripId[indexLeft] - stripId[testIndex] - 1;
0229           const auto newchan = channels[testIndex];
0230           const auto newdet = chanlocs->detID(newchan);
0231           sameDetLeft = det == newdet;
0232         } else {
0233           sameDetLeft = false;
0234         }
0235       }  // while loop
0236     }  // testIndex >= 0
0237
0238     // find right boundary
0239     auto indexRight = index;
0240     testIndex = index + 1;
0241
0242     if (testIndex < nStrips && stripId[testIndex] == invalidStrip) {
0243       testIndex += 2;
0244     }
0245
0246     if (testIndex < nStrips) {
0247       const auto testchan = channels[testIndex];
0248       const auto testDet = chanlocs->detID(testchan);
0249       auto rangeRight = stripId[testIndex] - stripId[indexRight] - 1;
0250       auto sameDetRight = det == testDet;
0251
0252       while (sameDetRight && rangeRight >= 0 && rangeRight <= maxSequentialHoles && size < clusterSizeLimit + 1) {
0253         addtocluster(indexRight);
0254         ++testIndex;
0255         if (testIndex < nStrips && stripId[testIndex] == invalidStrip) {
0256           testIndex += 2;
0257         }
0258         if (testIndex < nStrips) {
0259           rangeRight = stripId[testIndex] - stripId[indexRight] - 1;
0260           const auto newchan = channels[testIndex];
0261           const auto newdet = chanlocs->detID(newchan);
0262           sameDetRight = det == newdet;
0263         } else {
0264           sameDetRight = false;
0265         }
0266       }  // while loop
0267     }  // testIndex < nStrips
0268     clusterIndexLeft[i] = indexLeft;
0269     clusterSize[i] = indexRight - indexLeft + 1;
0270     clusterDetId[i] = det;
0271     firstStrip[i] = stripId[indexLeft];
0272     trueCluster[i] =
0273         (noiseSquared_i * clusterThresholdSquared <= adcSum_i * adcSum_i) and (clusterSize[i] <= clusterSizeLimit);
0274   }  // i < nSeedStripsNC
0275   if (first == 0) {
0276     clust_data_d->nClusters_ = nSeedStripsNC;
0277   }
0278 }
0279
0280 __global__ static void checkClusterConditionGPU(StripDataView *sst_data_d,
0281                                                 const ConditionsDeviceView *conditions,
0282                                                 SiStripClustersCUDADevice::DeviceView *clust_data_d) {
0283   const uint16_t *__restrict__ stripId = sst_data_d->stripId;
0284   const auto __restrict__ chanlocs = sst_data_d->chanlocs;
0285   const uint16_t *__restrict__ channels = sst_data_d->channel;
0286   const uint8_t *__restrict__ adc = sst_data_d->adc;
0287   const float minGoodCharge = sst_data_d->minGoodCharge;  //1620.0;
0288   const auto nSeedStripsNC = clust_data_d->nClusters_;
0289   const auto __restrict__ clusterIndexLeft = clust_data_d->clusterIndex_;
0290
0291   auto __restrict__ clusterSize = clust_data_d->clusterSize_;
0292   auto __restrict__ clusterADCs = clust_data_d->clusterADCs_;
0293   auto __restrict__ trueCluster = clust_data_d->trueCluster_;
0294   auto __restrict__ barycenter = clust_data_d->barycenter_;
0295   auto __restrict__ charge = clust_data_d->charge_;
0296
0297   constexpr uint16_t stripIndexMask = 0x7FFF;
0298
0299   const int tid = threadIdx.x;
0300   const int bid = blockIdx.x;
0301   const int nthreads = blockDim.x;
0302   const int first = nthreads * bid + tid;
0303   const int stride = blockDim.x * gridDim.x;
0304
0305   for (int i = first; i < nSeedStripsNC; i += stride) {
0306     if (trueCluster[i]) {
0307       const int left = clusterIndexLeft[i];
0308       const int size = clusterSize[i];
0309
0310       if (i > 0 && clusterIndexLeft[i - 1] == left) {
0311         trueCluster[i] = 0;  // ignore duplicates
0312       } else {
0313         float adcSum = 0.0f;
0314         int sumx = 0;
0315         int suma = 0;
0316
0317         auto j = 0;
0318         for (int k = 0; k < size; k++) {
0319           const auto index = left + k;
0320           const auto chan = channels[index];
0321           const auto fed = chanlocs->fedID(chan);
0322           const auto channel = chanlocs->fedCh(chan);
0323           const auto strip = stripId[index];
0324 #ifdef GPU_CHECK
0325           if (fed == invalidFed) {
0326             printf("Invalid fed index %d\n", index);
0327           }
0328 #endif
0329           if (strip != invalidStrip) {
0330             const float gain_j = conditions->gain(fed, channel, strip);
0331
0332             uint8_t adc_j = adc[index];
0333             const int charge = static_cast<int>(static_cast<float>(adc_j) / gain_j + 0.5f);
0334
0335             constexpr uint8_t adc_low_saturation = 254;
0336             constexpr uint8_t adc_high_saturation = 255;
0337             constexpr int charge_low_saturation = 253;
0338             constexpr int charge_high_saturation = 1022;
0339             if (adc_j < adc_low_saturation) {
0340               adc_j =
0341                   (charge > charge_high_saturation ? adc_high_saturation
0342                                                    : (charge > charge_low_saturation ? adc_low_saturation : charge));
0343             }
0344             clusterADCs[j * nSeedStripsNC + i] = adc_j;
0345
0346             adcSum += static_cast<float>(adc_j);
0347             sumx += j * adc_j;
0348             suma += adc_j;
0349             j++;
0350           }
0351         }  // loop over cluster strips
0352         charge[i] = adcSum;
0353         const auto chan = channels[left];
0354         const fedId_t fed = chanlocs->fedID(chan);
0355         const fedCh_t channel = chanlocs->fedCh(chan);
0356         trueCluster[i] = (adcSum * conditions->invthick(fed, channel)) > minGoodCharge;
0357         const auto bary_i = static_cast<float>(sumx) / static_cast<float>(suma);
0358         barycenter[i] = static_cast<float>(stripId[left] & stripIndexMask) + bary_i + 0.5f;
0359         clusterSize[i] = j;
0360       }  // not a duplicate cluster
0361     }  // trueCluster[i] is true
0362   }  // i < nSeedStripsNC
0363 }
0364
0365 namespace stripgpu {
0366   void SiStripRawToClusterGPUKernel::unpackChannelsGPU(const ConditionsDeviceView *conditions, cudaStream_t stream) {
0367     constexpr int nthreads = 128;
0368     const auto channels = chanlocsGPU_->size();
0369     const auto nblocks = (channels + nthreads - 1) / nthreads;
0370
0371     unpackChannels<<<nblocks, nthreads, 0, stream>>>(chanlocsGPU_->channelLocsView(),
0372                                                      conditions,
0373                                                      stripdata_->alldataGPU_.get(),
0374                                                      stripdata_->channelGPU_.get(),
0375                                                      stripdata_->stripIdGPU_.get());
0376   }
0377
0378   void SiStripRawToClusterGPUKernel::allocateSSTDataGPU(int max_strips, cudaStream_t stream) {
0379     stripdata_->seedStripsMask_ = cms::cuda::make_device_unique<int[]>(2 * max_strips, stream);
0380     stripdata_->prefixSeedStripsNCMask_ = cms::cuda::make_device_unique<int[]>(2 * max_strips, stream);
0381
0382     sst_data_d_->chanlocs = chanlocsGPU_->channelLocsView();
0383     sst_data_d_->stripId = stripdata_->stripIdGPU_.get();
0384     sst_data_d_->channel = stripdata_->channelGPU_.get();
0385     sst_data_d_->adc = stripdata_->alldataGPU_.get();
0386     sst_data_d_->seedStripsMask = stripdata_->seedStripsMask_.get();
0387     sst_data_d_->prefixSeedStripsNCMask = stripdata_->prefixSeedStripsNCMask_.get();
0388
0389     sst_data_d_->seedStripsNCMask = sst_data_d_->seedStripsMask + max_strips;
0390     sst_data_d_->seedStripsNCIndex = sst_data_d_->prefixSeedStripsNCMask + max_strips;
0391
0392     sst_data_d_->channelThreshold = channelThreshold_;
0393     sst_data_d_->seedThreshold = seedThreshold_;
0394     sst_data_d_->clusterThresholdSquared = clusterThresholdSquared_;
0395     sst_data_d_->maxSequentialHoles = maxSequentialHoles_;
0396     sst_data_d_->maxSequentialBad = maxSequentialBad_;
0397     sst_data_d_->maxAdjacentBad = maxAdjacentBad_;
0398     sst_data_d_->minGoodCharge = minGoodCharge_;
0399     sst_data_d_->clusterSizeLimit = maxClusterSize_;
0400
0401     pt_sst_data_d_ = cms::cuda::make_device_unique<StripDataView>(stream);
0402     cms::cuda::copyAsync(pt_sst_data_d_, sst_data_d_, stream);
0403 #ifdef GPU_CHECK
0404     cudaCheck(cudaStreamSynchronize(stream));
0405 #endif
0406   }
0407
0408   void SiStripRawToClusterGPUKernel::findClusterGPU(const ConditionsDeviceView *conditions, cudaStream_t stream) {
0409     const int nthreads = 128;
0410     const int nStrips = sst_data_d_->nStrips;
0411     const int nSeeds = std::min(kMaxSeedStrips, nStrips);
0412     const int nblocks = (nSeeds + nthreads - 1) / nthreads;
0413
0414 #ifdef GPU_DEBUG
0415     auto cpu_index = cms::cuda::make_host_unique<int[]>(nStrips, stream);
0416     auto cpu_strip = cms::cuda::make_host_unique<uint16_t[]>(nStrips, stream);
0417     auto cpu_adc = cms::cuda::make_host_unique<uint8_t[]>(nStrips, stream);
0418
0419     cudaCheck(cudaMemcpyAsync(
0420         cpu_strip.get(), sst_data_d_->stripId, nStrips * sizeof(uint16_t), cudaMemcpyDeviceToHost, stream));
0421     cudaCheck(
0422         cudaMemcpyAsync(cpu_adc.get(), sst_data_d_->adc, nStrips * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));
0423     cudaCheck(cudaMemcpyAsync(
0424         cpu_index.get(), sst_data_d_->seedStripsNCIndex, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
0425     cudaCheck(cudaStreamSynchronize(stream));
0426
0427     for (int i = 0; i < nStrips; i++) {
0428       std::cout << " cpu_strip " << cpu_strip[i] << " cpu_adc " << (unsigned int)cpu_adc[i] << " cpu index "
0429                 << cpu_index[i] << std::endl;
0430     }
0431 #endif
0432
0433     auto clust_data_d = clusters_d_.view();
0434     findLeftRightBoundaryGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions, clust_data_d);
0435     cudaCheck(cudaGetLastError());
0436 #ifdef GPU_CHECK
0437     cudaDeviceSynchronize();
0438     cudaCheck(cudaGetLastError());
0439 #endif
0440
0441     cudaCheck(cudaMemcpyAsync(clusters_d_.nClustersPtr(),
0442                               &(clust_data_d->nClusters_),
0443                               sizeof(clust_data_d->nClusters_),
0444                               cudaMemcpyDeviceToHost,
0445                               stream));
0446
0447     checkClusterConditionGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions, clust_data_d);
0448     cudaCheck(cudaGetLastError());
0449
0450 #ifdef GPU_CHECK
0451     cudaDeviceSynchronize();
0452     cudaCheck(cudaGetLastError());
0453 #endif
0454
0455 #ifdef GPU_DEBUG
0456     cudaStreamSynchronize(stream);
0457     auto clust_data = std::make_unique<SiStripClustersCUDAHost>(clusters_d_, stream);
0458     cudaStreamSynchronize(stream);
0459
0460     const auto clusterIndexLeft = clust_data->clusterIndex().get();
0461     const auto clusterSize = clust_data->clusterSize().get();
0462     const auto trueCluster = clust_data->trueCluster().get();
0463     const auto clusterADCs = clust_data->clusterADCs().get();
0464     const auto detids = clust_data->clusterDetId().get();
0465     const auto charge = clust_data->charge().get();
0466
0467     const auto nSeedStripsNC = clusters_d_.nClusters();
0468     std::cout << "findClusterGPU nSeedStripsNC=" << nSeedStripsNC << std::endl;
0469
0470     for (auto i = 0U; i < nSeedStripsNC; i++) {
0471       if (trueCluster[i]) {
0472         int left = clusterIndexLeft[i];
0473         uint32_t size = clusterSize[i];
0474         const auto detid = detids[i];
0475         std::cout << "i=" << i << " detId " << detid << " left " << left << " size " << size << " charge " << charge[i]
0476                   << ": ";
0477         size = std::min(size, maxClusterSize_);
0478         for (uint32_t j = 0; j < size; j++) {
0479           std::cout << (unsigned int)clusterADCs[j * nSeedStripsNC + i] << " ";
0480         }
0481         std::cout << std::endl;
0482       }
0483     }
0484 #endif
0485   }
0486
0487   void SiStripRawToClusterGPUKernel::setSeedStripsNCIndexGPU(const ConditionsDeviceView *conditions,
0488                                                              cudaStream_t stream) {
0489 #ifdef GPU_DEBUG
0490     int nStrips = sst_data_d_->nStrips;
0491     auto cpu_strip = cms::cuda::make_host_unique<uint16_t[]>(nStrips, stream);
0492     auto cpu_adc = cms::cuda::make_host_unique<uint8_t[]>(nStrips, stream);
0493
0494     cudaCheck(cudaMemcpyAsync(
0495         cpu_strip.get(), sst_data_d_->stripId, nStrips * sizeof(uint16_t), cudaMemcpyDeviceToHost, stream));
0496     cudaCheck(
0497         cudaMemcpyAsync(cpu_adc.get(), sst_data_d_->adc, nStrips * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));
0498     cudaCheck(cudaStreamSynchronize(stream));
0499
0500     for (int i = 0; i < nStrips; i++) {
0501       std::cout << " cpu_strip " << cpu_strip[i] << " cpu_adc " << (unsigned int)cpu_adc[i] << std::endl;
0502     }
0503 #endif
0504
0505     int nthreads = 256;
0506     int nblocks = (sst_data_d_->nStrips + nthreads - 1) / nthreads;
0507
0508     //mark seed strips
0509     setSeedStripsGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions);
0510     cudaCheck(cudaGetLastError());
0511 #ifdef GPU_CHECK
0512     cudaCheck(cudaStreamSynchronize(stream));
0513 #endif
0514
0515     //mark only non-consecutive seed strips (mask out consecutive seed strips)
0516     setNCSeedStripsGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions);
0517     cudaCheck(cudaGetLastError());
0518 #ifdef GPU_CHECK
0519     cudaCheck(cudaStreamSynchronize(stream));
0520 #endif
0521
0522     std::size_t temp_storage_bytes = 0;
0523     cub::DeviceScan::ExclusiveSum(nullptr,
0524                                   temp_storage_bytes,
0525                                   sst_data_d_->seedStripsNCMask,
0526                                   sst_data_d_->prefixSeedStripsNCMask,
0527                                   sst_data_d_->nStrips,
0528                                   stream);
0529 #ifdef GPU_DEBUG
0530     std::cout << "temp_storage_bytes=" << temp_storage_bytes << std::endl;
0531 #endif
0532 #ifdef GPU_CHECK
0533     cudaCheck(cudaStreamSynchronize(stream));
0534 #endif
0535
0536     {
0537       auto d_temp_storage = cms::cuda::make_device_unique<uint8_t[]>(temp_storage_bytes, stream);
0538       cub::DeviceScan::ExclusiveSum(d_temp_storage.get(),
0539                                     temp_storage_bytes,
0540                                     sst_data_d_->seedStripsNCMask,
0541                                     sst_data_d_->prefixSeedStripsNCMask,
0542                                     sst_data_d_->nStrips,
0543                                     stream);
0544     }
0545 #ifdef GPU_CHECK
0546     cudaCheck(cudaStreamSynchronize(stream));
0547 #endif
0548
0549     setStripIndexGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get());
0550     cudaCheck(cudaGetLastError());
0551 #ifdef GPU_CHECK
0552     cudaCheck(cudaStreamSynchronize(stream));
0553 #endif
0554
0555 #ifdef GPU_DEBUG
0556     auto cpu_mask = cms::cuda::make_host_unique<int[]>(nStrips, stream);
0557     auto cpu_prefix = cms::cuda::make_host_unique<int[]>(nStrips, stream);
0558     auto cpu_index = cms::cuda::make_host_unique<int[]>(nStrips, stream);
0559
0560     cudaCheck(cudaMemcpyAsync(&(sst_data_d_->nSeedStripsNC),
0561                               sst_data_d_->prefixSeedStripsNCMask + sst_data_d_->nStrips - 1,
0562                               sizeof(int),
0563                               cudaMemcpyDeviceToHost,
0564                               stream));
0565     cudaCheck(cudaMemcpyAsync(
0566         cpu_mask.get(), sst_data_d_->seedStripsNCMask, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
0567     cudaCheck(cudaMemcpyAsync(
0568         cpu_prefix.get(), sst_data_d_->prefixSeedStripsNCMask, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
0569     cudaCheck(cudaMemcpyAsync(
0570         cpu_index.get(), sst_data_d_->seedStripsNCIndex, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
0571     cudaCheck(cudaStreamSynchronize(stream));
0572
0573     const int nSeedStripsNC = std::min(kMaxSeedStrips, sst_data_d_->nSeedStripsNC);
0574     std::cout << "nStrips=" << nStrips << " nSeedStripsNC=" << sst_data_d_->nSeedStripsNC << std::endl;
0575     for (int i = 0; i < nStrips; i++) {
0576       std::cout << " i " << i << " mask " << cpu_mask[i] << " prefix " << cpu_prefix[i] << " index " << cpu_index[i]
0577                 << std::endl;
0578     }
0579 #endif
0580   }
0581 }  // namespace stripgpu