EcalRecProducers/plugins/EcalUncalibRecHitMultiFitAlgoGPU.cu

0001 #include <iostream>
0002 #include <limits>
0003
0004 #include <cuda.h>
0005
0006 #include "CondFormats/EcalObjects/interface/EcalMGPAGainRatio.h"
0007 #include "CondFormats/EcalObjects/interface/EcalPedestals.h"
0008 #include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h"
0009 #include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
0010 #include "CondFormats/EcalObjects/interface/EcalSampleMask.h"
0011 #include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h"
0012 #include "CondFormats/EcalObjects/interface/EcalXtalGroupId.h"
0013 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
0014 #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
0015
0016 #include "AmplitudeComputationCommonKernels.h"
0017 #include "AmplitudeComputationKernels.h"
0018 #include "EcalUncalibRecHitMultiFitAlgoGPU.h"
0019 #include "TimeComputationKernels.h"
0020
0021 //#define DEBUG
0022
0023 //#define ECAL_RECO_CUDA_DEBUG
0024
0025 namespace ecal {
0026   namespace multifit {
0027
0028     void entryPoint(EventInputDataGPU const& eventInputGPU,
0029                     EventOutputDataGPU& eventOutputGPU,
0030                     EventDataForScratchGPU& scratch,
0031                     ConditionsProducts const& conditions,
0032                     ConfigurationParameters const& configParameters,
0033                     cudaStream_t cudaStream) {
0034       using digis_type = std::vector<uint16_t>;
0035       using dids_type = std::vector<uint32_t>;
0036       // accodring to the cpu setup  //----> hardcoded
0037       bool const gainSwitchUseMaxSampleEB = true;
0038       // accodring to the cpu setup  //----> hardcoded
0039       bool const gainSwitchUseMaxSampleEE = false;
0040
0041       uint32_t const offsetForHashes = conditions.offsetForHashes;
0042       uint32_t const offsetForInputs = eventInputGPU.ebDigis.size;
0043       unsigned int totalChannels = eventInputGPU.ebDigis.size + eventInputGPU.eeDigis.size;
0044
0045       //
0046       // 1d preparation kernel
0047       //
0048       unsigned int nchannels_per_block = 32;
0049       unsigned int threads_1d = 10 * nchannels_per_block;
0050       unsigned int blocks_1d = threads_1d > 10 * totalChannels ? 1 : (totalChannels * 10 + threads_1d - 1) / threads_1d;
0051       int shared_bytes = nchannels_per_block * EcalDataFrame::MAXSAMPLES *
0052                          (sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(bool) + sizeof(char) + sizeof(bool));
0053       kernel_prep_1d_and_initialize<<<blocks_1d, threads_1d, shared_bytes, cudaStream>>>(
0054           conditions.pulseShapes.values,
0055           eventInputGPU.ebDigis.data.get(),
0056           eventInputGPU.ebDigis.ids.get(),
0057           eventInputGPU.eeDigis.data.get(),
0058           eventInputGPU.eeDigis.ids.get(),
0059           (SampleVector*)scratch.samples.get(),
0060           (SampleVector*)eventOutputGPU.recHitsEB.amplitudesAll.get(),
0061           (SampleVector*)eventOutputGPU.recHitsEE.amplitudesAll.get(),
0062           (SampleGainVector*)scratch.gainsNoise.get(),
0063           conditions.pedestals.mean_x1,
0064           conditions.pedestals.mean_x12,
0065           conditions.pedestals.rms_x12,
0066           conditions.pedestals.mean_x6,
0067           conditions.gainRatios.gain6Over1,
0068           conditions.gainRatios.gain12Over6,
0069           scratch.hasSwitchToGain6.get(),
0070           scratch.hasSwitchToGain1.get(),
0071           scratch.isSaturated.get(),
0072           eventOutputGPU.recHitsEB.amplitude.get(),
0073           eventOutputGPU.recHitsEE.amplitude.get(),
0074           eventOutputGPU.recHitsEB.chi2.get(),
0075           eventOutputGPU.recHitsEE.chi2.get(),
0076           eventOutputGPU.recHitsEB.pedestal.get(),
0077           eventOutputGPU.recHitsEE.pedestal.get(),
0078           eventOutputGPU.recHitsEB.did.get(),
0079           eventOutputGPU.recHitsEE.did.get(),
0080           eventOutputGPU.recHitsEB.flags.get(),
0081           eventOutputGPU.recHitsEE.flags.get(),
0082           scratch.acState.get(),
0083           (BXVectorType*)scratch.activeBXs.get(),
0084           offsetForHashes,
0085           offsetForInputs,
0086           gainSwitchUseMaxSampleEB,
0087           gainSwitchUseMaxSampleEE,
0088           totalChannels);
0089       cudaCheck(cudaGetLastError());
0090
0091       //
0092       // 2d preparation kernel
0093       //
0094       int blocks_2d = totalChannels;
0095       dim3 threads_2d{10, 10};
0096       kernel_prep_2d<<<blocks_2d, threads_2d, 0, cudaStream>>>((SampleGainVector*)scratch.gainsNoise.get(),
0097                                                                eventInputGPU.ebDigis.ids.get(),
0098                                                                eventInputGPU.eeDigis.ids.get(),
0099                                                                conditions.pedestals.rms_x12,
0100                                                                conditions.pedestals.rms_x6,
0101                                                                conditions.pedestals.rms_x1,
0102                                                                conditions.gainRatios.gain12Over6,
0103                                                                conditions.gainRatios.gain6Over1,
0104                                                                conditions.samplesCorrelation.EBG12SamplesCorrelation,
0105                                                                conditions.samplesCorrelation.EBG6SamplesCorrelation,
0106                                                                conditions.samplesCorrelation.EBG1SamplesCorrelation,
0107                                                                conditions.samplesCorrelation.EEG12SamplesCorrelation,
0108                                                                conditions.samplesCorrelation.EEG6SamplesCorrelation,
0109                                                                conditions.samplesCorrelation.EEG1SamplesCorrelation,
0110                                                                (SampleMatrix*)scratch.noisecov.get(),
0111                                                                (PulseMatrixType*)scratch.pulse_matrix.get(),
0112                                                                conditions.pulseShapes.values,
0113                                                                scratch.hasSwitchToGain6.get(),
0114                                                                scratch.hasSwitchToGain1.get(),
0115                                                                scratch.isSaturated.get(),
0116                                                                offsetForHashes,
0117                                                                offsetForInputs);
0118       cudaCheck(cudaGetLastError());
0119
0120       // run minimization kernels
0121       v1::minimization_procedure(eventInputGPU, eventOutputGPU, scratch, conditions, configParameters, cudaStream);
0122
0123       if (configParameters.shouldRunTimingComputation) {
0124         //
0125         // TODO: this guy can run concurrently with other kernels,
0126         // there is no dependence on the order of execution
0127         //
0128         unsigned int threads_time_init = threads_1d;
0129         unsigned int blocks_time_init = blocks_1d;
0130         int sharedBytesInit = 2 * threads_time_init * sizeof(SampleVector::Scalar);
0131         kernel_time_computation_init<<<blocks_time_init, threads_time_init, sharedBytesInit, cudaStream>>>(
0132             eventInputGPU.ebDigis.data.get(),
0133             eventInputGPU.ebDigis.ids.get(),
0134             eventInputGPU.eeDigis.data.get(),
0135             eventInputGPU.eeDigis.ids.get(),
0136             conditions.pedestals.rms_x12,
0137             conditions.pedestals.rms_x6,
0138             conditions.pedestals.rms_x1,
0139             conditions.pedestals.mean_x12,
0140             conditions.pedestals.mean_x6,
0141             conditions.pedestals.mean_x1,
0142             conditions.gainRatios.gain12Over6,
0143             conditions.gainRatios.gain6Over1,
0144             scratch.sample_values.get(),
0145             scratch.sample_value_errors.get(),
0146             scratch.ampMaxError.get(),
0147             scratch.useless_sample_values.get(),
0148             scratch.pedestal_nums.get(),
0149             offsetForHashes,
0150             offsetForInputs,
0151             conditions.sampleMask.getEcalSampleMaskRecordEB(),
0152             conditions.sampleMask.getEcalSampleMaskRecordEE(),
0153             totalChannels);
0154         cudaCheck(cudaGetLastError());
0155
0156         //
0157         // TODO: small kernel only for EB. It needs to be checked if
0158         /// fusing such small kernels is beneficial in here
0159         //
0160         // we are running only over EB digis
0161         // therefore we need to create threads/blocks only for that
0162         unsigned int const threadsFixMGPA = threads_1d;
0163         unsigned int const blocksFixMGPA =
0164             threadsFixMGPA > 10 * eventInputGPU.ebDigis.size
0165                 ? 1
0166                 : (10 * eventInputGPU.ebDigis.size + threadsFixMGPA - 1) / threadsFixMGPA;
0167         kernel_time_compute_fixMGPAslew<<<blocksFixMGPA, threadsFixMGPA, 0, cudaStream>>>(
0168             eventInputGPU.ebDigis.data.get(),
0169             eventInputGPU.eeDigis.data.get(),
0170             scratch.sample_values.get(),
0171             scratch.sample_value_errors.get(),
0172             scratch.useless_sample_values.get(),
0173             conditions.sampleMask.getEcalSampleMaskRecordEB(),
0174             totalChannels,
0175             offsetForInputs);
0176         cudaCheck(cudaGetLastError());
0177
0178         int sharedBytes = EcalDataFrame::MAXSAMPLES * nchannels_per_block * 4 * sizeof(SampleVector::Scalar);
0179         auto const threads_nullhypot = threads_1d;
0180         auto const blocks_nullhypot = blocks_1d;
0181         kernel_time_compute_nullhypot<<<blocks_nullhypot, threads_nullhypot, sharedBytes, cudaStream>>>(
0182             scratch.sample_values.get(),
0183             scratch.sample_value_errors.get(),
0184             scratch.useless_sample_values.get(),
0185             scratch.chi2sNullHypot.get(),
0186             scratch.sum0sNullHypot.get(),
0187             scratch.sumAAsNullHypot.get(),
0188             totalChannels);
0189         cudaCheck(cudaGetLastError());
0190
0191         unsigned int nchannels_per_block_makeratio = 10;
0192         unsigned int threads_makeratio = 45 * nchannels_per_block_makeratio;
0193         unsigned int blocks_makeratio = threads_makeratio > 45 * totalChannels
0194                                             ? 1
0195                                             : (totalChannels * 45 + threads_makeratio - 1) / threads_makeratio;
0196         int sharedBytesMakeRatio = 5 * threads_makeratio * sizeof(SampleVector::Scalar);
0197         kernel_time_compute_makeratio<<<blocks_makeratio, threads_makeratio, sharedBytesMakeRatio, cudaStream>>>(
0198             scratch.sample_values.get(),
0199             scratch.sample_value_errors.get(),
0200             eventInputGPU.ebDigis.ids.get(),
0201             eventInputGPU.eeDigis.ids.get(),
0202             scratch.useless_sample_values.get(),
0203             scratch.pedestal_nums.get(),
0204             configParameters.amplitudeFitParametersEB,
0205             configParameters.amplitudeFitParametersEE,
0206             configParameters.timeFitParametersEB,
0207             configParameters.timeFitParametersEE,
0208             scratch.sumAAsNullHypot.get(),
0209             scratch.sum0sNullHypot.get(),
0210             scratch.tMaxAlphaBetas.get(),
0211             scratch.tMaxErrorAlphaBetas.get(),
0212             scratch.accTimeMax.get(),
0213             scratch.accTimeWgt.get(),
0214             scratch.tcState.get(),
0215             configParameters.timeFitParametersSizeEB,
0216             configParameters.timeFitParametersSizeEE,
0217             configParameters.timeFitLimitsFirstEB,
0218             configParameters.timeFitLimitsFirstEE,
0219             configParameters.timeFitLimitsSecondEB,
0220             configParameters.timeFitLimitsSecondEE,
0221             totalChannels,
0222             offsetForInputs);
0223         cudaCheck(cudaGetLastError());
0224
0225         auto const threads_findamplchi2 = threads_1d;
0226         auto const blocks_findamplchi2 = blocks_1d;
0227         int const sharedBytesFindAmplChi2 = 2 * threads_findamplchi2 * sizeof(SampleVector::Scalar);
0228         kernel_time_compute_findamplchi2_and_finish<<<blocks_findamplchi2,
0229                                                       threads_findamplchi2,
0230                                                       sharedBytesFindAmplChi2,
0231                                                       cudaStream>>>(scratch.sample_values.get(),
0232                                                                     scratch.sample_value_errors.get(),
0233                                                                     eventInputGPU.ebDigis.ids.get(),
0234                                                                     eventInputGPU.eeDigis.ids.get(),
0235                                                                     scratch.useless_sample_values.get(),
0236                                                                     scratch.tMaxAlphaBetas.get(),
0237                                                                     scratch.tMaxErrorAlphaBetas.get(),
0238                                                                     scratch.accTimeMax.get(),
0239                                                                     scratch.accTimeWgt.get(),
0240                                                                     configParameters.amplitudeFitParametersEB,
0241                                                                     configParameters.amplitudeFitParametersEE,
0242                                                                     scratch.sumAAsNullHypot.get(),
0243                                                                     scratch.sum0sNullHypot.get(),
0244                                                                     scratch.chi2sNullHypot.get(),
0245                                                                     scratch.tcState.get(),
0246                                                                     scratch.ampMaxAlphaBeta.get(),
0247                                                                     scratch.ampMaxError.get(),
0248                                                                     scratch.timeMax.get(),
0249                                                                     scratch.timeError.get(),
0250                                                                     totalChannels,
0251                                                                     offsetForInputs);
0252         cudaCheck(cudaGetLastError());
0253
0254         auto const threads_timecorr = 32;
0255         auto const blocks_timecorr =
0256             threads_timecorr > totalChannels ? 1 : (totalChannels + threads_timecorr - 1) / threads_timecorr;
0257         kernel_time_correction_and_finalize<<<blocks_timecorr, threads_timecorr, 0, cudaStream>>>(
0258             eventOutputGPU.recHitsEB.amplitude.get(),
0259             eventOutputGPU.recHitsEE.amplitude.get(),
0260             eventInputGPU.ebDigis.data.get(),
0261             eventInputGPU.ebDigis.ids.get(),
0262             eventInputGPU.eeDigis.data.get(),
0263             eventInputGPU.eeDigis.ids.get(),
0264             conditions.timeBiasCorrections.ebTimeCorrAmplitudeBins,
0265             conditions.timeBiasCorrections.eeTimeCorrAmplitudeBins,
0266             conditions.timeBiasCorrections.ebTimeCorrShiftBins,
0267             conditions.timeBiasCorrections.eeTimeCorrShiftBins,
0268             scratch.timeMax.get(),
0269             scratch.timeError.get(),
0270             conditions.pedestals.rms_x12,
0271             conditions.timeCalibConstants.values,
0272             eventOutputGPU.recHitsEB.jitter.get(),
0273             eventOutputGPU.recHitsEE.jitter.get(),
0274             eventOutputGPU.recHitsEB.jitterError.get(),
0275             eventOutputGPU.recHitsEE.jitterError.get(),
0276             eventOutputGPU.recHitsEB.flags.get(),
0277             eventOutputGPU.recHitsEE.flags.get(),
0278             conditions.timeBiasCorrections.ebTimeCorrAmplitudeBinsSize,
0279             conditions.timeBiasCorrections.eeTimeCorrAmplitudeBinsSize,
0280             configParameters.timeConstantTermEB,
0281             configParameters.timeConstantTermEE,
0282             conditions.timeOffsetConstant.getEBValue(),
0283             conditions.timeOffsetConstant.getEEValue(),
0284             configParameters.timeNconstEB,
0285             configParameters.timeNconstEE,
0286             configParameters.amplitudeThreshEB,
0287             configParameters.amplitudeThreshEE,
0288             configParameters.outOfTimeThreshG12pEB,
0289             configParameters.outOfTimeThreshG12pEE,
0290             configParameters.outOfTimeThreshG12mEB,
0291             configParameters.outOfTimeThreshG12mEE,
0292             configParameters.outOfTimeThreshG61pEB,
0293             configParameters.outOfTimeThreshG61pEE,
0294             configParameters.outOfTimeThreshG61mEB,
0295             configParameters.outOfTimeThreshG61mEE,
0296             offsetForHashes,
0297             offsetForInputs,
0298             totalChannels);
0299         cudaCheck(cudaGetLastError());
0300       }
0301     }
0302
0303   }  // namespace multifit
0304 }  // namespace ecal