Warning, /RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cu is written in an unsupported language. File is not indexed.
0001 //#define GPU_DEBUG
0002 #if defined(EDM_ML_DEBUG) || defined(GPU_DEBUG)
0003 #define GPU_CHECK
0004 #include <cstdio>
0005 #endif
0006
0007 #include <cub/cub.cuh>
0008 // prevent _Float16 defined by CUDA headers from hiding the ISO C type used by GCC
0009 #ifdef _Float16
0010 #undef _Float16
0011 #endif
0012
0013 #include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
0014 #include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
0015 #include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/allocate_host.h"
0017 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
0018 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
0019 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0020 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
0021
0022 #include "ChannelLocsGPU.h"
0023 #include "SiStripRawToClusterGPUKernel.h"
0024 #include "StripDataView.h"
0025
0026 using namespace stripgpu;
0027 using ConditionsDeviceView = SiStripClusterizerConditionsGPU::Data::DeviceView;
0028
0029 __global__ static void unpackChannels(const ChannelLocsView *chanlocs,
0030 const ConditionsDeviceView *conditions,
0031 uint8_t *alldata,
0032 uint16_t *channel,
0033 stripId_t *stripId) {
0034 const int tid = threadIdx.x;
0035 const int bid = blockIdx.x;
0036 const int nthreads = blockDim.x;
0037
0038 const auto first = nthreads * bid + tid;
0039 const auto stride = blockDim.x * gridDim.x;
0040 for (auto chan = first; chan < chanlocs->size(); chan += stride) {
0041 const auto fedid = chanlocs->fedID(chan);
0042 const auto fedch = chanlocs->fedCh(chan);
0043 const auto ipair = conditions->iPair(fedid, fedch);
0044 const auto ipoff = sistrip::STRIPS_PER_FEDCH * ipair;
0045
0046 const auto data = chanlocs->input(chan);
0047 const auto len = chanlocs->length(chan);
0048
0049 if (data != nullptr && len > 0) {
0050 auto aoff = chanlocs->offset(chan);
0051 auto choff = chanlocs->inoff(chan);
0052 const auto end = choff + len;
0053
0054 while (choff < end) {
0055 auto stripIndex = data[(choff++) ^ 7] + ipoff;
0056 const auto groupLength = data[(choff++) ^ 7];
0057
0058 for (auto i = 0; i < 2; ++i) {
0059 stripId[aoff] = invalidStrip;
0060 alldata[aoff++] = 0;
0061 }
0062
0063 for (auto i = 0; i < groupLength; ++i) {
0064 stripId[aoff] = stripIndex++;
0065 channel[aoff] = chan;
0066 alldata[aoff++] = data[(choff++) ^ 7];
0067 }
0068 }
0069 } // choff < end
0070 } // data != nullptr && len > 0
0071 } // chan < chanlocs->size()
0072
0073 __global__ static void setSeedStripsGPU(StripDataView *sst_data_d, const ConditionsDeviceView *conditions) {
0074 const int nStrips = sst_data_d->nStrips;
0075 const auto __restrict__ chanlocs = sst_data_d->chanlocs;
0076 const uint8_t *__restrict__ adc = sst_data_d->adc;
0077 const uint16_t *__restrict__ channels = sst_data_d->channel;
0078 const uint16_t *__restrict__ stripId = sst_data_d->stripId;
0079 int *__restrict__ seedStripsMask = sst_data_d->seedStripsMask;
0080 int *__restrict__ seedStripsNCMask = sst_data_d->seedStripsNCMask;
0081 const float seedThreshold = sst_data_d->seedThreshold;
0082
0083 const int tid = threadIdx.x;
0084 const int bid = blockIdx.x;
0085 const int nthreads = blockDim.x;
0086 const int first = nthreads * bid + tid;
0087 const int stride = blockDim.x * gridDim.x;
0088
0089 for (int i = first; i < nStrips; i += stride) {
0090 seedStripsMask[i] = 0;
0091 seedStripsNCMask[i] = 0;
0092 const stripId_t strip = stripId[i];
0093 if (strip != invalidStrip) {
0094 const auto chan = channels[i];
0095 const fedId_t fed = chanlocs->fedID(chan);
0096 const fedCh_t channel = chanlocs->fedCh(chan);
0097 const float noise_i = conditions->noise(fed, channel, strip);
0098 const uint8_t adc_i = adc[i];
0099
0100 seedStripsMask[i] = (adc_i >= static_cast<uint8_t>(noise_i * seedThreshold)) ? 1 : 0;
0101 seedStripsNCMask[i] = seedStripsMask[i];
0102 }
0103 }
0104 }
0105
0106 __global__ static void setNCSeedStripsGPU(StripDataView *sst_data_d, const ConditionsDeviceView *conditions) {
0107 const int nStrips = sst_data_d->nStrips;
0108 const auto __restrict__ chanlocs = sst_data_d->chanlocs;
0109 const uint16_t *__restrict__ channels = sst_data_d->channel;
0110 const uint16_t *__restrict__ stripId = sst_data_d->stripId;
0111 const int *__restrict__ seedStripsMask = sst_data_d->seedStripsMask;
0112 int *__restrict__ seedStripsNCMask = sst_data_d->seedStripsNCMask;
0113
0114 const int tid = threadIdx.x;
0115 const int bid = blockIdx.x;
0116 const int nthreads = blockDim.x;
0117 const int first = nthreads * bid + tid;
0118 const int stride = blockDim.x * gridDim.x;
0119
0120 for (int i = first; i < nStrips; i += stride) {
0121 if (i > 0) {
0122 const auto detid = chanlocs->detID(channels[i]);
0123 const auto detid1 = chanlocs->detID(channels[i - 1]);
0124
0125 if (seedStripsMask[i] && seedStripsMask[i - 1] && (stripId[i] - stripId[i - 1]) == 1 && (detid == detid1))
0126 seedStripsNCMask[i] = 0;
0127 }
0128 }
0129 }
0130
0131 __global__ static void setStripIndexGPU(StripDataView *sst_data_d) {
0132 const int nStrips = sst_data_d->nStrips;
0133 const int *__restrict__ seedStripsNCMask = sst_data_d->seedStripsNCMask;
0134 const int *__restrict__ prefixSeedStripsNCMask = sst_data_d->prefixSeedStripsNCMask;
0135 int *__restrict__ seedStripsNCIndex = sst_data_d->seedStripsNCIndex;
0136
0137 const int tid = threadIdx.x;
0138 const int bid = blockIdx.x;
0139 const int nthreads = blockDim.x;
0140 const int first = nthreads * bid + tid;
0141 const int stride = blockDim.x * gridDim.x;
0142
0143 for (int i = first; i < nStrips; i += stride) {
0144 if (seedStripsNCMask[i] == 1) {
0145 const int index = prefixSeedStripsNCMask[i];
0146 seedStripsNCIndex[index] = i;
0147 }
0148 }
0149 }
0150
0151 __global__ static void findLeftRightBoundaryGPU(const StripDataView *sst_data_d,
0152 const ConditionsDeviceView *conditions,
0153 SiStripClustersCUDADevice::DeviceView *clust_data_d) {
0154 const int nStrips = sst_data_d->nStrips;
0155 const int *__restrict__ seedStripsNCIndex = sst_data_d->seedStripsNCIndex;
0156 const auto __restrict__ chanlocs = sst_data_d->chanlocs;
0157 const uint16_t *__restrict__ stripId = sst_data_d->stripId;
0158 const uint16_t *__restrict__ channels = sst_data_d->channel;
0159 const uint8_t *__restrict__ adc = sst_data_d->adc;
0160 const int nSeedStripsNC = std::min(kMaxSeedStrips, *(sst_data_d->prefixSeedStripsNCMask + nStrips - 1));
0161 const uint8_t maxSequentialHoles = sst_data_d->maxSequentialHoles;
0162 const float channelThreshold = sst_data_d->channelThreshold;
0163 const float clusterThresholdSquared = sst_data_d->clusterThresholdSquared;
0164 const int clusterSizeLimit = sst_data_d->clusterSizeLimit;
0165
0166 auto __restrict__ clusterIndexLeft = clust_data_d->clusterIndex_;
0167 auto __restrict__ clusterSize = clust_data_d->clusterSize_;
0168 auto __restrict__ clusterDetId = clust_data_d->clusterDetId_;
0169 auto __restrict__ firstStrip = clust_data_d->firstStrip_;
0170 auto __restrict__ trueCluster = clust_data_d->trueCluster_;
0171
0172 const int tid = threadIdx.x;
0173 const int bid = blockIdx.x;
0174 const int nthreads = blockDim.x;
0175 const int first = nthreads * bid + tid;
0176 const int stride = blockDim.x * gridDim.x;
0177
0178 for (int i = first; i < nSeedStripsNC; i += stride) {
0179 const auto index = seedStripsNCIndex[i];
0180 const auto chan = channels[index];
0181 const auto fed = chanlocs->fedID(chan);
0182 const auto channel = chanlocs->fedCh(chan);
0183 const auto det = chanlocs->detID(chan);
0184 const auto strip = stripId[index];
0185 const auto noise_i = conditions->noise(fed, channel, strip);
0186
0187 auto noiseSquared_i = noise_i * noise_i;
0188 float adcSum_i = static_cast<float>(adc[index]);
0189 auto testIndex = index - 1;
0190 auto size = 1;
0191
0192 auto addtocluster = [&](int &indexLR) {
0193 const auto testchan = channels[testIndex];
0194 const auto testFed = chanlocs->fedID(testchan);
0195 const auto testChannel = chanlocs->fedCh(testchan);
0196 const auto testStrip = stripId[testIndex];
0197 const auto testNoise = conditions->noise(testFed, testChannel, testStrip);
0198 const auto testADC = adc[testIndex];
0199
0200 if (testADC >= static_cast<uint8_t>(testNoise * channelThreshold)) {
0201 ++size;
0202 indexLR = testIndex;
0203 noiseSquared_i += testNoise * testNoise;
0204 adcSum_i += static_cast<float>(testADC);
0205 }
0206 };
0207
0208 // find left boundary
0209 auto indexLeft = index;
0210
0211 if (testIndex >= 0 && stripId[testIndex] == invalidStrip) {
0212 testIndex -= 2;
0213 }
0214
0215 if (testIndex >= 0) {
0216 const auto testchan = channels[testIndex];
0217 const auto testDet = chanlocs->detID(testchan);
0218 auto rangeLeft = stripId[indexLeft] - stripId[testIndex] - 1;
0219 auto sameDetLeft = det == testDet;
0220
0221 while (sameDetLeft && rangeLeft >= 0 && rangeLeft <= maxSequentialHoles && size < clusterSizeLimit + 1) {
0222 addtocluster(indexLeft);
0223 --testIndex;
0224 if (testIndex >= 0 && stripId[testIndex] == invalidStrip) {
0225 testIndex -= 2;
0226 }
0227 if (testIndex >= 0) {
0228 rangeLeft = stripId[indexLeft] - stripId[testIndex] - 1;
0229 const auto newchan = channels[testIndex];
0230 const auto newdet = chanlocs->detID(newchan);
0231 sameDetLeft = det == newdet;
0232 } else {
0233 sameDetLeft = false;
0234 }
0235 } // while loop
0236 } // testIndex >= 0
0237
0238 // find right boundary
0239 auto indexRight = index;
0240 testIndex = index + 1;
0241
0242 if (testIndex < nStrips && stripId[testIndex] == invalidStrip) {
0243 testIndex += 2;
0244 }
0245
0246 if (testIndex < nStrips) {
0247 const auto testchan = channels[testIndex];
0248 const auto testDet = chanlocs->detID(testchan);
0249 auto rangeRight = stripId[testIndex] - stripId[indexRight] - 1;
0250 auto sameDetRight = det == testDet;
0251
0252 while (sameDetRight && rangeRight >= 0 && rangeRight <= maxSequentialHoles && size < clusterSizeLimit + 1) {
0253 addtocluster(indexRight);
0254 ++testIndex;
0255 if (testIndex < nStrips && stripId[testIndex] == invalidStrip) {
0256 testIndex += 2;
0257 }
0258 if (testIndex < nStrips) {
0259 rangeRight = stripId[testIndex] - stripId[indexRight] - 1;
0260 const auto newchan = channels[testIndex];
0261 const auto newdet = chanlocs->detID(newchan);
0262 sameDetRight = det == newdet;
0263 } else {
0264 sameDetRight = false;
0265 }
0266 } // while loop
0267 } // testIndex < nStrips
0268 clusterIndexLeft[i] = indexLeft;
0269 clusterSize[i] = indexRight - indexLeft + 1;
0270 clusterDetId[i] = det;
0271 firstStrip[i] = stripId[indexLeft];
0272 trueCluster[i] =
0273 (noiseSquared_i * clusterThresholdSquared <= adcSum_i * adcSum_i) and (clusterSize[i] <= clusterSizeLimit);
0274 } // i < nSeedStripsNC
0275 if (first == 0) {
0276 clust_data_d->nClusters_ = nSeedStripsNC;
0277 }
0278 }
0279
0280 __global__ static void checkClusterConditionGPU(StripDataView *sst_data_d,
0281 const ConditionsDeviceView *conditions,
0282 SiStripClustersCUDADevice::DeviceView *clust_data_d) {
0283 const uint16_t *__restrict__ stripId = sst_data_d->stripId;
0284 const auto __restrict__ chanlocs = sst_data_d->chanlocs;
0285 const uint16_t *__restrict__ channels = sst_data_d->channel;
0286 const uint8_t *__restrict__ adc = sst_data_d->adc;
0287 const float minGoodCharge = sst_data_d->minGoodCharge; //1620.0;
0288 const auto nSeedStripsNC = clust_data_d->nClusters_;
0289 const auto __restrict__ clusterIndexLeft = clust_data_d->clusterIndex_;
0290
0291 auto __restrict__ clusterSize = clust_data_d->clusterSize_;
0292 auto __restrict__ clusterADCs = clust_data_d->clusterADCs_;
0293 auto __restrict__ trueCluster = clust_data_d->trueCluster_;
0294 auto __restrict__ barycenter = clust_data_d->barycenter_;
0295 auto __restrict__ charge = clust_data_d->charge_;
0296
0297 constexpr uint16_t stripIndexMask = 0x7FFF;
0298
0299 const int tid = threadIdx.x;
0300 const int bid = blockIdx.x;
0301 const int nthreads = blockDim.x;
0302 const int first = nthreads * bid + tid;
0303 const int stride = blockDim.x * gridDim.x;
0304
0305 for (int i = first; i < nSeedStripsNC; i += stride) {
0306 if (trueCluster[i]) {
0307 const int left = clusterIndexLeft[i];
0308 const int size = clusterSize[i];
0309
0310 if (i > 0 && clusterIndexLeft[i - 1] == left) {
0311 trueCluster[i] = 0; // ignore duplicates
0312 } else {
0313 float adcSum = 0.0f;
0314 int sumx = 0;
0315 int suma = 0;
0316
0317 auto j = 0;
0318 for (int k = 0; k < size; k++) {
0319 const auto index = left + k;
0320 const auto chan = channels[index];
0321 const auto fed = chanlocs->fedID(chan);
0322 const auto channel = chanlocs->fedCh(chan);
0323 const auto strip = stripId[index];
0324 #ifdef GPU_CHECK
0325 if (fed == invalidFed) {
0326 printf("Invalid fed index %d\n", index);
0327 }
0328 #endif
0329 if (strip != invalidStrip) {
0330 const float gain_j = conditions->gain(fed, channel, strip);
0331
0332 uint8_t adc_j = adc[index];
0333 const int charge = static_cast<int>(static_cast<float>(adc_j) / gain_j + 0.5f);
0334
0335 constexpr uint8_t adc_low_saturation = 254;
0336 constexpr uint8_t adc_high_saturation = 255;
0337 constexpr int charge_low_saturation = 253;
0338 constexpr int charge_high_saturation = 1022;
0339 if (adc_j < adc_low_saturation) {
0340 adc_j =
0341 (charge > charge_high_saturation ? adc_high_saturation
0342 : (charge > charge_low_saturation ? adc_low_saturation : charge));
0343 }
0344 clusterADCs[j * nSeedStripsNC + i] = adc_j;
0345
0346 adcSum += static_cast<float>(adc_j);
0347 sumx += j * adc_j;
0348 suma += adc_j;
0349 j++;
0350 }
0351 } // loop over cluster strips
0352 charge[i] = adcSum;
0353 const auto chan = channels[left];
0354 const fedId_t fed = chanlocs->fedID(chan);
0355 const fedCh_t channel = chanlocs->fedCh(chan);
0356 trueCluster[i] = (adcSum * conditions->invthick(fed, channel)) > minGoodCharge;
0357 const auto bary_i = static_cast<float>(sumx) / static_cast<float>(suma);
0358 barycenter[i] = static_cast<float>(stripId[left] & stripIndexMask) + bary_i + 0.5f;
0359 clusterSize[i] = j;
0360 } // not a duplicate cluster
0361 } // trueCluster[i] is true
0362 } // i < nSeedStripsNC
0363 }
0364
0365 namespace stripgpu {
0366 void SiStripRawToClusterGPUKernel::unpackChannelsGPU(const ConditionsDeviceView *conditions, cudaStream_t stream) {
0367 constexpr int nthreads = 128;
0368 const auto channels = chanlocsGPU_->size();
0369 const auto nblocks = (channels + nthreads - 1) / nthreads;
0370
0371 unpackChannels<<<nblocks, nthreads, 0, stream>>>(chanlocsGPU_->channelLocsView(),
0372 conditions,
0373 stripdata_->alldataGPU_.get(),
0374 stripdata_->channelGPU_.get(),
0375 stripdata_->stripIdGPU_.get());
0376 }
0377
0378 void SiStripRawToClusterGPUKernel::allocateSSTDataGPU(int max_strips, cudaStream_t stream) {
0379 stripdata_->seedStripsMask_ = cms::cuda::make_device_unique<int[]>(2 * max_strips, stream);
0380 stripdata_->prefixSeedStripsNCMask_ = cms::cuda::make_device_unique<int[]>(2 * max_strips, stream);
0381
0382 sst_data_d_->chanlocs = chanlocsGPU_->channelLocsView();
0383 sst_data_d_->stripId = stripdata_->stripIdGPU_.get();
0384 sst_data_d_->channel = stripdata_->channelGPU_.get();
0385 sst_data_d_->adc = stripdata_->alldataGPU_.get();
0386 sst_data_d_->seedStripsMask = stripdata_->seedStripsMask_.get();
0387 sst_data_d_->prefixSeedStripsNCMask = stripdata_->prefixSeedStripsNCMask_.get();
0388
0389 sst_data_d_->seedStripsNCMask = sst_data_d_->seedStripsMask + max_strips;
0390 sst_data_d_->seedStripsNCIndex = sst_data_d_->prefixSeedStripsNCMask + max_strips;
0391
0392 sst_data_d_->channelThreshold = channelThreshold_;
0393 sst_data_d_->seedThreshold = seedThreshold_;
0394 sst_data_d_->clusterThresholdSquared = clusterThresholdSquared_;
0395 sst_data_d_->maxSequentialHoles = maxSequentialHoles_;
0396 sst_data_d_->maxSequentialBad = maxSequentialBad_;
0397 sst_data_d_->maxAdjacentBad = maxAdjacentBad_;
0398 sst_data_d_->minGoodCharge = minGoodCharge_;
0399 sst_data_d_->clusterSizeLimit = maxClusterSize_;
0400
0401 pt_sst_data_d_ = cms::cuda::make_device_unique<StripDataView>(stream);
0402 cms::cuda::copyAsync(pt_sst_data_d_, sst_data_d_, stream);
0403 #ifdef GPU_CHECK
0404 cudaCheck(cudaStreamSynchronize(stream));
0405 #endif
0406 }
0407
0408 void SiStripRawToClusterGPUKernel::findClusterGPU(const ConditionsDeviceView *conditions, cudaStream_t stream) {
0409 const int nthreads = 128;
0410 const int nStrips = sst_data_d_->nStrips;
0411 const int nSeeds = std::min(kMaxSeedStrips, nStrips);
0412 const int nblocks = (nSeeds + nthreads - 1) / nthreads;
0413
0414 #ifdef GPU_DEBUG
0415 auto cpu_index = cms::cuda::make_host_unique<int[]>(nStrips, stream);
0416 auto cpu_strip = cms::cuda::make_host_unique<uint16_t[]>(nStrips, stream);
0417 auto cpu_adc = cms::cuda::make_host_unique<uint8_t[]>(nStrips, stream);
0418
0419 cudaCheck(cudaMemcpyAsync(
0420 cpu_strip.get(), sst_data_d_->stripId, nStrips * sizeof(uint16_t), cudaMemcpyDeviceToHost, stream));
0421 cudaCheck(
0422 cudaMemcpyAsync(cpu_adc.get(), sst_data_d_->adc, nStrips * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));
0423 cudaCheck(cudaMemcpyAsync(
0424 cpu_index.get(), sst_data_d_->seedStripsNCIndex, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
0425 cudaCheck(cudaStreamSynchronize(stream));
0426
0427 for (int i = 0; i < nStrips; i++) {
0428 std::cout << " cpu_strip " << cpu_strip[i] << " cpu_adc " << (unsigned int)cpu_adc[i] << " cpu index "
0429 << cpu_index[i] << std::endl;
0430 }
0431 #endif
0432
0433 auto clust_data_d = clusters_d_.view();
0434 findLeftRightBoundaryGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions, clust_data_d);
0435 cudaCheck(cudaGetLastError());
0436 #ifdef GPU_CHECK
0437 cudaDeviceSynchronize();
0438 cudaCheck(cudaGetLastError());
0439 #endif
0440
0441 cudaCheck(cudaMemcpyAsync(clusters_d_.nClustersPtr(),
0442 &(clust_data_d->nClusters_),
0443 sizeof(clust_data_d->nClusters_),
0444 cudaMemcpyDeviceToHost,
0445 stream));
0446
0447 checkClusterConditionGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions, clust_data_d);
0448 cudaCheck(cudaGetLastError());
0449
0450 #ifdef GPU_CHECK
0451 cudaDeviceSynchronize();
0452 cudaCheck(cudaGetLastError());
0453 #endif
0454
0455 #ifdef GPU_DEBUG
0456 cudaStreamSynchronize(stream);
0457 auto clust_data = std::make_unique<SiStripClustersCUDAHost>(clusters_d_, stream);
0458 cudaStreamSynchronize(stream);
0459
0460 const auto clusterIndexLeft = clust_data->clusterIndex().get();
0461 const auto clusterSize = clust_data->clusterSize().get();
0462 const auto trueCluster = clust_data->trueCluster().get();
0463 const auto clusterADCs = clust_data->clusterADCs().get();
0464 const auto detids = clust_data->clusterDetId().get();
0465 const auto charge = clust_data->charge().get();
0466
0467 const auto nSeedStripsNC = clusters_d_.nClusters();
0468 std::cout << "findClusterGPU nSeedStripsNC=" << nSeedStripsNC << std::endl;
0469
0470 for (auto i = 0U; i < nSeedStripsNC; i++) {
0471 if (trueCluster[i]) {
0472 int left = clusterIndexLeft[i];
0473 uint32_t size = clusterSize[i];
0474 const auto detid = detids[i];
0475 std::cout << "i=" << i << " detId " << detid << " left " << left << " size " << size << " charge " << charge[i]
0476 << ": ";
0477 size = std::min(size, maxClusterSize_);
0478 for (uint32_t j = 0; j < size; j++) {
0479 std::cout << (unsigned int)clusterADCs[j * nSeedStripsNC + i] << " ";
0480 }
0481 std::cout << std::endl;
0482 }
0483 }
0484 #endif
0485 }
0486
0487 void SiStripRawToClusterGPUKernel::setSeedStripsNCIndexGPU(const ConditionsDeviceView *conditions,
0488 cudaStream_t stream) {
0489 #ifdef GPU_DEBUG
0490 int nStrips = sst_data_d_->nStrips;
0491 auto cpu_strip = cms::cuda::make_host_unique<uint16_t[]>(nStrips, stream);
0492 auto cpu_adc = cms::cuda::make_host_unique<uint8_t[]>(nStrips, stream);
0493
0494 cudaCheck(cudaMemcpyAsync(
0495 cpu_strip.get(), sst_data_d_->stripId, nStrips * sizeof(uint16_t), cudaMemcpyDeviceToHost, stream));
0496 cudaCheck(
0497 cudaMemcpyAsync(cpu_adc.get(), sst_data_d_->adc, nStrips * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream));
0498 cudaCheck(cudaStreamSynchronize(stream));
0499
0500 for (int i = 0; i < nStrips; i++) {
0501 std::cout << " cpu_strip " << cpu_strip[i] << " cpu_adc " << (unsigned int)cpu_adc[i] << std::endl;
0502 }
0503 #endif
0504
0505 int nthreads = 256;
0506 int nblocks = (sst_data_d_->nStrips + nthreads - 1) / nthreads;
0507
0508 //mark seed strips
0509 setSeedStripsGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions);
0510 cudaCheck(cudaGetLastError());
0511 #ifdef GPU_CHECK
0512 cudaCheck(cudaStreamSynchronize(stream));
0513 #endif
0514
0515 //mark only non-consecutive seed strips (mask out consecutive seed strips)
0516 setNCSeedStripsGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions);
0517 cudaCheck(cudaGetLastError());
0518 #ifdef GPU_CHECK
0519 cudaCheck(cudaStreamSynchronize(stream));
0520 #endif
0521
0522 std::size_t temp_storage_bytes = 0;
0523 cub::DeviceScan::ExclusiveSum(nullptr,
0524 temp_storage_bytes,
0525 sst_data_d_->seedStripsNCMask,
0526 sst_data_d_->prefixSeedStripsNCMask,
0527 sst_data_d_->nStrips,
0528 stream);
0529 #ifdef GPU_DEBUG
0530 std::cout << "temp_storage_bytes=" << temp_storage_bytes << std::endl;
0531 #endif
0532 #ifdef GPU_CHECK
0533 cudaCheck(cudaStreamSynchronize(stream));
0534 #endif
0535
0536 {
0537 auto d_temp_storage = cms::cuda::make_device_unique<uint8_t[]>(temp_storage_bytes, stream);
0538 cub::DeviceScan::ExclusiveSum(d_temp_storage.get(),
0539 temp_storage_bytes,
0540 sst_data_d_->seedStripsNCMask,
0541 sst_data_d_->prefixSeedStripsNCMask,
0542 sst_data_d_->nStrips,
0543 stream);
0544 }
0545 #ifdef GPU_CHECK
0546 cudaCheck(cudaStreamSynchronize(stream));
0547 #endif
0548
0549 setStripIndexGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get());
0550 cudaCheck(cudaGetLastError());
0551 #ifdef GPU_CHECK
0552 cudaCheck(cudaStreamSynchronize(stream));
0553 #endif
0554
0555 #ifdef GPU_DEBUG
0556 auto cpu_mask = cms::cuda::make_host_unique<int[]>(nStrips, stream);
0557 auto cpu_prefix = cms::cuda::make_host_unique<int[]>(nStrips, stream);
0558 auto cpu_index = cms::cuda::make_host_unique<int[]>(nStrips, stream);
0559
0560 cudaCheck(cudaMemcpyAsync(&(sst_data_d_->nSeedStripsNC),
0561 sst_data_d_->prefixSeedStripsNCMask + sst_data_d_->nStrips - 1,
0562 sizeof(int),
0563 cudaMemcpyDeviceToHost,
0564 stream));
0565 cudaCheck(cudaMemcpyAsync(
0566 cpu_mask.get(), sst_data_d_->seedStripsNCMask, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
0567 cudaCheck(cudaMemcpyAsync(
0568 cpu_prefix.get(), sst_data_d_->prefixSeedStripsNCMask, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
0569 cudaCheck(cudaMemcpyAsync(
0570 cpu_index.get(), sst_data_d_->seedStripsNCIndex, nStrips * sizeof(int), cudaMemcpyDeviceToHost, stream));
0571 cudaCheck(cudaStreamSynchronize(stream));
0572
0573 const int nSeedStripsNC = std::min(kMaxSeedStrips, sst_data_d_->nSeedStripsNC);
0574 std::cout << "nStrips=" << nStrips << " nSeedStripsNC=" << sst_data_d_->nSeedStripsNC << std::endl;
0575 for (int i = 0; i < nStrips; i++) {
0576 std::cout << " i " << i << " mask " << cpu_mask[i] << " prefix " << cpu_prefix[i] << " index " << cpu_index[i]
0577 << std::endl;
0578 }
0579 #endif
0580 }
0581 } // namespace stripgpu