plugins/alpaka/AmplitudeComputationCommonKernels.h

0001 #ifndef RecoLocalCalo_EcalRecProducers_plugins_alpaka_AmplitudeComputationCommonKernels_h
0002 #define RecoLocalCalo_EcalRecProducers_plugins_alpaka_AmplitudeComputationCommonKernels_h
0003
0004 #include <cstdlib>
0005 #include <limits>
0006 #include <alpaka/alpaka.hpp>
0007
0008 #include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitConditionsDevice.h"
0009 #include "DataFormats/EcalDigi/interface/alpaka/EcalDigiDeviceCollection.h"
0010 #include "DataFormats/EcalRecHit/interface/alpaka/EcalUncalibratedRecHitDeviceCollection.h"
0011 #include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
0012 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
0013 #include "DataFormats/EcalDigi/interface/EcalMGPASample.h"
0014 #include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
0015 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0016 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0017 #include "RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h"
0018
0019 #include "DeclsForKernels.h"
0020 #include "KernelHelpers.h"
0021
0022 namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
0023
0024   ///
0025   /// assume kernel launch configuration is
0026   /// (MAXSAMPLES * nchannels, blocks)
0027   /// TODO: is there a point to split this kernel further to separate reductions
0028   ///
0029   class Kernel_prep_1d_and_initialize {
0030   public:
0031     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0032     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0033                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0034                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0035                                   EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEB,
0036                                   EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEE,
0037                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0038                                   ::ecal::multifit::SampleVector* amplitudes,
0039                                   ::ecal::multifit::SampleGainVector* gainsNoise,
0040                                   bool* hasSwitchToGain6,
0041                                   bool* hasSwitchToGain1,
0042                                   bool* isSaturated,
0043                                   char* acState,
0044                                   ::ecal::multifit::BXVectorType* bxs,
0045                                   bool const gainSwitchUseMaxSampleEB,
0046                                   bool const gainSwitchUseMaxSampleEE) const {
0047       constexpr bool dynamicPedestal = false;  //---- default to false, ok
0048       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0049       constexpr int sample_max = 5;
0050       constexpr int full_pulse_max = 9;
0051       auto const offsetForHashes = conditionsDev.offsetEE();
0052
0053       auto const nchannelsEB = digisDevEB.size();
0054       auto const nchannelsEE = digisDevEE.size();
0055       auto const nchannels = nchannelsEB + nchannelsEE;
0056       auto const totalElements = nchannels * nsamples;
0057
0058       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0059
0060       char* shared_mem = alpaka::getDynSharedMem<char>(acc);
0061       auto* shr_hasSwitchToGain6 = reinterpret_cast<bool*>(shared_mem);
0062       auto* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + elemsPerBlock;
0063       auto* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + elemsPerBlock;
0064       auto* shr_isSaturated = shr_hasSwitchToGain0 + elemsPerBlock;
0065       auto* shr_hasSwitchToGain0_tmp = shr_isSaturated + elemsPerBlock;
0066       auto* shr_counts = reinterpret_cast<char*>(shr_hasSwitchToGain0_tmp) + elemsPerBlock;
0067
0068       for (auto block : cms::alpakatools::uniform_groups(acc, totalElements)) {
0069         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0070           // set the output collection size scalars
0071           if (idx.global == 0) {
0072             uncalibRecHitsEB.size() = nchannelsEB;
0073             uncalibRecHitsEE.size() = nchannelsEE;
0074           }
0075
0076           auto const ch = idx.global / nsamples;
0077           // for accessing input arrays
0078           int const inputTx = ch >= nchannelsEB ? idx.global - nchannelsEB * nsamples : idx.global;
0079           // eb is first and then ee
0080           auto const* digis_in = ch >= nchannelsEB ? digisDevEE.data()->data() : digisDevEB.data()->data();
0081           auto const gainId = ecalMGPA::gainId(digis_in[inputTx]);
0082
0083           // store into shared mem for initialization
0084           shr_hasSwitchToGain6[idx.local] = gainId == EcalMgpaBitwiseGain6;
0085           shr_hasSwitchToGain1[idx.local] = gainId == EcalMgpaBitwiseGain1;
0086           shr_hasSwitchToGain0_tmp[idx.local] = gainId == EcalMgpaBitwiseGain0;
0087           shr_hasSwitchToGain0[idx.local] = shr_hasSwitchToGain0_tmp[idx.local];
0088           shr_counts[idx.local] = 0;
0089         }
0090
0091         alpaka::syncBlockThreads(acc);
0092
0093         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0094           auto const sample = idx.local % nsamples;
0095
0096           // non-divergent branch (except for the last 4 threads)
0097           if (idx.local <= elemsPerBlock - 5) {
0098             CMS_UNROLL_LOOP
0099             for (int i = 0; i < 5; ++i)
0100               shr_counts[idx.local] += shr_hasSwitchToGain0[idx.local + i];
0101           }
0102           shr_isSaturated[idx.local] = shr_counts[idx.local] == 5;
0103
0104           //
0105           // unrolled reductions
0106           //
0107           if (sample < 5) {
0108             shr_hasSwitchToGain6[idx.local] = shr_hasSwitchToGain6[idx.local] || shr_hasSwitchToGain6[idx.local + 5];
0109             shr_hasSwitchToGain1[idx.local] = shr_hasSwitchToGain1[idx.local] || shr_hasSwitchToGain1[idx.local + 5];
0110
0111             // duplication of hasSwitchToGain0 in order not to
0112             // introduce another syncthreads
0113             shr_hasSwitchToGain0_tmp[idx.local] =
0114                 shr_hasSwitchToGain0_tmp[idx.local] || shr_hasSwitchToGain0_tmp[idx.local + 5];
0115           }
0116         }
0117
0118         alpaka::syncBlockThreads(acc);
0119
0120         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0121           auto const sample = idx.local % nsamples;
0122
0123           if (sample < 2) {
0124             // note, both threads per channel take value [3] twice to avoid another if
0125             shr_hasSwitchToGain6[idx.local] = shr_hasSwitchToGain6[idx.local] || shr_hasSwitchToGain6[idx.local + 2] ||
0126                                               shr_hasSwitchToGain6[idx.local + 3];
0127             shr_hasSwitchToGain1[idx.local] = shr_hasSwitchToGain1[idx.local] || shr_hasSwitchToGain1[idx.local + 2] ||
0128                                               shr_hasSwitchToGain1[idx.local + 3];
0129
0130             shr_hasSwitchToGain0_tmp[idx.local] = shr_hasSwitchToGain0_tmp[idx.local] ||
0131                                                   shr_hasSwitchToGain0_tmp[idx.local + 2] ||
0132                                                   shr_hasSwitchToGain0_tmp[idx.local + 3];
0133
0134             // sample < 2 -> first 2 threads of each channel will be used here
0135             // => 0 -> will compare 3 and 4 and put into 0
0136             // => 1 -> will compare 4 and 5 and put into 1
0137             shr_isSaturated[idx.local] = shr_isSaturated[idx.local + 3] || shr_isSaturated[idx.local + 4];
0138           }
0139         }
0140
0141         alpaka::syncBlockThreads(acc);
0142
0143         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0144           auto const ch = idx.global / nsamples;
0145           auto const sample = idx.local % nsamples;
0146
0147           if (sample == 0) {
0148             shr_hasSwitchToGain6[idx.local] = shr_hasSwitchToGain6[idx.local] || shr_hasSwitchToGain6[idx.local + 1];
0149             shr_hasSwitchToGain1[idx.local] = shr_hasSwitchToGain1[idx.local] || shr_hasSwitchToGain1[idx.local + 1];
0150             shr_hasSwitchToGain0_tmp[idx.local] =
0151                 shr_hasSwitchToGain0_tmp[idx.local] || shr_hasSwitchToGain0_tmp[idx.local + 1];
0152
0153             hasSwitchToGain6[ch] = shr_hasSwitchToGain6[idx.local];
0154             hasSwitchToGain1[ch] = shr_hasSwitchToGain1[idx.local];
0155
0156             shr_isSaturated[idx.local + 3] = shr_isSaturated[idx.local] || shr_isSaturated[idx.local + 1];
0157             isSaturated[ch] = shr_isSaturated[idx.local + 3];
0158           }
0159         }
0160
0161         // TODO: w/o this sync, there is a race
0162         // if (idx.local == sample_max) below uses max sample thread, not for 0 sample
0163         // check if we can remove it
0164         alpaka::syncBlockThreads(acc);
0165
0166         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0167           auto const ch = idx.global / nsamples;
0168           auto const sample = idx.local % nsamples;
0169
0170           // for accessing input arrays
0171           int const inputCh = ch >= nchannelsEB ? ch - nchannelsEB : ch;
0172           int const inputTx = ch >= nchannelsEB ? idx.global - nchannelsEB * nsamples : idx.global;
0173
0174           auto const* dids = ch >= nchannelsEB ? digisDevEE.id() : digisDevEB.id();
0175           auto const did = DetId{dids[inputCh]};
0176           auto const isBarrel = did.subdetId() == EcalBarrel;
0177           // TODO offset for ee, 0 for eb
0178           auto const hashedId = isBarrel ? reconstruction::hashedIndexEB(did.rawId())
0179                                          : offsetForHashes + reconstruction::hashedIndexEE(did.rawId());
0180
0181           // eb is first and then ee
0182           auto const* digis_in = ch >= nchannelsEB ? digisDevEE.data()->data() : digisDevEB.data()->data();
0183
0184           auto* amplitudesForMinimization = reinterpret_cast<::ecal::multifit::SampleVector*>(
0185               ch >= nchannelsEB ? uncalibRecHitsEE.outOfTimeAmplitudes()->data()
0186                                 : uncalibRecHitsEB.outOfTimeAmplitudes()->data());
0187           auto* energies = ch >= nchannelsEB ? uncalibRecHitsEE.amplitude() : uncalibRecHitsEB.amplitude();
0188           auto* chi2 = ch >= nchannelsEB ? uncalibRecHitsEE.chi2() : uncalibRecHitsEB.chi2();
0189           auto* g_pedestal = ch >= nchannelsEB ? uncalibRecHitsEE.pedestal() : uncalibRecHitsEB.pedestal();
0190           auto* dids_out = ch >= nchannelsEB ? uncalibRecHitsEE.id() : uncalibRecHitsEB.id();
0191           auto* flags = ch >= nchannelsEB ? uncalibRecHitsEE.flags() : uncalibRecHitsEB.flags();
0192
0193           auto const adc = ecalMGPA::adc(digis_in[inputTx]);
0194           auto const gainId = ecalMGPA::gainId(digis_in[inputTx]);
0195           ::ecal::multifit::SampleVector::Scalar amplitude = 0.;
0196           ::ecal::multifit::SampleVector::Scalar pedestal = 0.;
0197           ::ecal::multifit::SampleVector::Scalar gainratio = 0.;
0198
0199           // TODO: divergent branch
0200           if (gainId == 0 || gainId == 3) {
0201             pedestal = conditionsDev.pedestals_mean_x1()[hashedId];
0202             gainratio = conditionsDev.gain6Over1()[hashedId] * conditionsDev.gain12Over6()[hashedId];
0203             gainsNoise[ch](sample) = 2;
0204           } else if (gainId == 1) {
0205             pedestal = conditionsDev.pedestals_mean_x12()[hashedId];
0206             gainratio = 1.;
0207             gainsNoise[ch](sample) = 0;
0208           } else if (gainId == 2) {
0209             pedestal = conditionsDev.pedestals_mean_x6()[hashedId];
0210             gainratio = conditionsDev.gain12Over6()[hashedId];
0211             gainsNoise[ch](sample) = 1;
0212           }
0213
0214           // TODO: compile time constant -> branch should be non-divergent
0215           if (dynamicPedestal)
0216             amplitude = static_cast<::ecal::multifit::SampleVector::Scalar>(adc) * gainratio;
0217           else
0218             amplitude = (static_cast<::ecal::multifit::SampleVector::Scalar>(adc) - pedestal) * gainratio;
0219           amplitudes[ch][sample] = amplitude;
0220
0221 #ifdef ECAL_RECO_ALPAKA_DEBUG
0222           printf("%d %d %d %d %f %f %f\n", idx.global, ch, sample, adc, amplitude, pedestal, gainratio);
0223           if (adc == 0)
0224             printf("adc is zero\n");
0225 #endif
0226
0227           //
0228           // initialization
0229           //
0230           amplitudesForMinimization[inputCh](sample) = 0;
0231           bxs[ch](sample) = sample - 5;
0232
0233           // select the thread for the max sample
0234           //---> hardcoded above to be 5th sample, ok
0235           if (sample == sample_max) {
0236             //
0237             // initialization
0238             //
0239             acState[ch] = static_cast<char>(MinimizationState::NotFinished);
0240             energies[inputCh] = 0;
0241             chi2[inputCh] = 0;
0242             g_pedestal[inputCh] = 0;
0243             uint32_t flag = 0;
0244             dids_out[inputCh] = did.rawId();
0245
0246             // start of this channel in shared mem
0247             auto const chStart = idx.local - sample_max;
0248             // thread for the max sample in shared mem
0249             auto const threadMax = idx.local;
0250             auto const gainSwitchUseMaxSample = isBarrel ? gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE;
0251
0252             // this flag setting is applied to all of the cases
0253             if (shr_hasSwitchToGain6[chStart])
0254               flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6;
0255             if (shr_hasSwitchToGain1[chStart])
0256               flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1;
0257
0258             // this corresponds to cpu branching on lastSampleBeforeSaturation
0259             // likely false
0260             // check only for the idx.local corresponding to sample==0
0261             if (sample == 0 && shr_hasSwitchToGain0_tmp[idx.local]) {
0262               // assign for the case some sample having gainId == 0
0263               //energies[inputCh] = amplitudes[ch][sample_max];
0264               energies[inputCh] = amplitude;
0265
0266               // check if samples before sample_max have true
0267               bool saturated_before_max = false;
0268               CMS_UNROLL_LOOP
0269               for (char ii = 0; ii < 5; ++ii)
0270                 saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii];
0271
0272               // if saturation is in the max sample and not in the first 5
0273               if (!saturated_before_max && shr_hasSwitchToGain0[threadMax])
0274                 energies[inputCh] = 49140;  // 4095 * 12 (maximum ADC range * MultiGainPreAmplifier (MGPA) gain)
0275                                             // This is the actual maximum range that is set when we saturate.
0276                                             //---- AM FIXME : no pedestal subtraction???
0277                                             //It should be "(4095. - pedestal) * gainratio"
0278
0279               // set state flag to terminate further processing of this channel
0280               acState[ch] = static_cast<char>(MinimizationState::Precomputed);
0281               flag |= 0x1 << EcalUncalibratedRecHit::kSaturated;
0282               flags[inputCh] = flag;
0283               continue;
0284             }
0285
0286             // according to cpu version
0287             //            auto max_amplitude = amplitudes[ch][sample_max];
0288             auto const max_amplitude = amplitude;
0289             // pulse shape template value
0290             auto shape_value = conditionsDev.pulseShapes()[hashedId][full_pulse_max - 7];
0291             // note, no syncing as the same thread will be accessing here
0292             bool hasGainSwitch =
0293                 shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3];
0294
0295             // pedestal is final unconditionally
0296             g_pedestal[inputCh] = pedestal;
0297             if (hasGainSwitch && gainSwitchUseMaxSample) {
0298               // thread for sample=0 will access the right guys
0299               energies[inputCh] = max_amplitude / shape_value;
0300               acState[ch] = static_cast<char>(MinimizationState::Precomputed);
0301               flags[inputCh] = flag;
0302               continue;
0303             }
0304
0305             // will be used in the future for setting state
0306             auto const rmsForChecking = conditionsDev.pedestals_rms_x12()[hashedId];
0307
0308             // this happens cause sometimes rms_x12 is 0...
0309             // needs to be checkec why this is the case
0310             // general case here is that noisecov is a Zero matrix
0311             if (rmsForChecking == 0) {
0312               acState[ch] = static_cast<char>(MinimizationState::Precomputed);
0313               flags[inputCh] = flag;
0314               continue;
0315             }
0316
0317             // for the case when no shortcuts were taken
0318             flags[inputCh] = flag;
0319           }
0320         }
0321       }
0322     }
0323   };
0324
0325   ///
0326   /// assume kernel launch configuration is
0327   /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
0328   ///
0329   class Kernel_prep_2d {
0330   public:
0331     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0332     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0333                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0334                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0335                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0336                                   ::ecal::multifit::SampleGainVector const* gainsNoise,
0337                                   ::ecal::multifit::SampleMatrix* noisecov,
0338                                   ::ecal::multifit::PulseMatrixType* pulse_matrix,
0339                                   bool const* hasSwitchToGain6,
0340                                   bool const* hasSwitchToGain1,
0341                                   bool const* isSaturated) const {
0342       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0343       auto const offsetForHashes = conditionsDev.offsetEE();
0344       auto const nchannelsEB = digisDevEB.size();
0345       constexpr float addPedestalUncertainty = 0.f;
0346       constexpr bool dynamicPedestal = false;
0347       constexpr bool simplifiedNoiseModelForGainSwitch = true;  //---- default is true
0348
0349       // pulse matrix
0350       auto const* pulse_shapes = reinterpret_cast<const EcalPulseShape*>(conditionsDev.pulseShapes()->data());
0351
0352       auto const blockDimX = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[1u];
0353       auto const elemsPerBlockX = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[1u];
0354       auto const elemsPerBlockY = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0355       Vec2D const size_2d = {elemsPerBlockY, blockDimX * elemsPerBlockX};  // {y, x} coordinates
0356
0357       for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size_2d)) {
0358         auto const ch = ndindex[1] / nsamples;
0359         auto const tx = ndindex[1] % nsamples;
0360         auto const ty = ndindex[0];
0361
0362         // to access input arrays (ids and digis only)
0363         int const inputCh = ch >= nchannelsEB ? ch - nchannelsEB : ch;
0364         auto const* dids = ch >= nchannelsEB ? digisDevEE.id() : digisDevEB.id();
0365
0366         auto const did = DetId{dids[inputCh]};
0367         auto const isBarrel = did.subdetId() == EcalBarrel;
0368         auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0369                                        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0370         auto const* G12SamplesCorrelation = isBarrel ? conditionsDev.sampleCorrelation_EB_G12().data()
0371                                                      : conditionsDev.sampleCorrelation_EE_G12().data();
0372         auto const* G6SamplesCorrelation =
0373             isBarrel ? conditionsDev.sampleCorrelation_EB_G6().data() : conditionsDev.sampleCorrelation_EE_G6().data();
0374         auto const* G1SamplesCorrelation =
0375             isBarrel ? conditionsDev.sampleCorrelation_EB_G1().data() : conditionsDev.sampleCorrelation_EE_G1().data();
0376         auto const hasGainSwitch = hasSwitchToGain6[ch] || hasSwitchToGain1[ch] || isSaturated[ch];
0377
0378         auto const vidx = std::abs(static_cast<int>(ty) - static_cast<int>(tx));
0379
0380         // non-divergent branch for all threads per block
0381         if (hasGainSwitch) {
0382           // TODO: did not include simplified noise model
0383           float noise_value = 0;
0384
0385           // non-divergent branch - all threads per block
0386           // TODO: all of these constants indicate that
0387           // that these parts could be splitted into completely different
0388           // kernels and run one of them only depending on the config
0389           if (simplifiedNoiseModelForGainSwitch) {
0390             constexpr int isample_max = 5;  // according to cpu defs
0391             auto const gainidx = gainsNoise[ch][isample_max];
0392
0393             // non-divergent branches
0394             if (gainidx == 0) {
0395               auto const rms_x12 = conditionsDev.pedestals_rms_x12()[hashedId];
0396               noise_value = rms_x12 * rms_x12 * G12SamplesCorrelation[vidx];
0397             } else if (gainidx == 1) {
0398               auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
0399               auto const rms_x6 = conditionsDev.pedestals_rms_x6()[hashedId];
0400               noise_value = gain12Over6 * gain12Over6 * rms_x6 * rms_x6 * G6SamplesCorrelation[vidx];
0401             } else if (gainidx == 2) {
0402               auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
0403               auto const gain6Over1 = conditionsDev.gain6Over1()[hashedId];
0404               auto const gain12Over1 = gain12Over6 * gain6Over1;
0405               auto const rms_x1 = conditionsDev.pedestals_rms_x1()[hashedId];
0406               noise_value = gain12Over1 * gain12Over1 * rms_x1 * rms_x1 * G1SamplesCorrelation[vidx];
0407             }
0408             if (!dynamicPedestal && addPedestalUncertainty > 0.f)
0409               noise_value += addPedestalUncertainty * addPedestalUncertainty;
0410           } else {
0411             int gainidx = 0;
0412             char mask = gainidx;
0413             int pedestal = gainsNoise[ch][ty] == mask ? 1 : 0;
0414             //            NB: gainratio is 1, that is why it does not appear in the formula
0415             auto const rms_x12 = conditionsDev.pedestals_rms_x12()[hashedId];
0416             noise_value += rms_x12 * rms_x12 * pedestal * G12SamplesCorrelation[vidx];
0417             // non-divergent branch
0418             if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0419               noise_value += addPedestalUncertainty * addPedestalUncertainty * pedestal;  // gainratio is 1
0420             }
0421
0422             //
0423             gainidx = 1;
0424             mask = gainidx;
0425             pedestal = gainsNoise[ch][ty] == mask ? 1 : 0;
0426             auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
0427             auto const rms_x6 = conditionsDev.pedestals_rms_x6()[hashedId];
0428             noise_value += gain12Over6 * gain12Over6 * rms_x6 * rms_x6 * pedestal * G6SamplesCorrelation[vidx];
0429             // non-divergent branch
0430             if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0431               noise_value += gain12Over6 * gain12Over6 * addPedestalUncertainty * addPedestalUncertainty * pedestal;
0432             }
0433
0434             //
0435             gainidx = 2;
0436             mask = gainidx;
0437             pedestal = gainsNoise[ch][ty] == mask ? 1 : 0;
0438             auto const gain6Over1 = conditionsDev.gain6Over1()[hashedId];
0439             auto const gain12Over1 = gain12Over6 * gain6Over1;
0440             auto const rms_x1 = conditionsDev.pedestals_rms_x1()[hashedId];
0441             noise_value += gain12Over1 * gain12Over1 * rms_x1 * rms_x1 * pedestal * G1SamplesCorrelation[vidx];
0442             // non-divergent branch
0443             if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0444               noise_value += gain12Over1 * gain12Over1 * addPedestalUncertainty * addPedestalUncertainty * pedestal;
0445             }
0446           }
0447
0448           noisecov[ch](ty, tx) = noise_value;
0449         } else {
0450           auto const rms = conditionsDev.pedestals_rms_x12()[hashedId];
0451           float noise_value = rms * rms * G12SamplesCorrelation[vidx];
0452           if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0453             //----  add fully correlated component to noise covariance to inflate pedestal uncertainty
0454             noise_value += addPedestalUncertainty * addPedestalUncertainty;
0455           }
0456           noisecov[ch](ty, tx) = noise_value;
0457         }
0458
0459         auto const posToAccess = 9 - static_cast<int>(tx) + static_cast<int>(ty);  // see cpu for reference
0460         float const value = posToAccess >= 7 ? pulse_shapes[hashedId].pdfval[posToAccess - 7] : 0;
0461         pulse_matrix[ch](ty, tx) = value;
0462       }
0463     }
0464   };
0465
0466 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit
0467
0468 namespace alpaka::trait {
0469   using namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit;
0470
0471   //! The trait for getting the size of the block shared dynamic memory for Kernel_prep_1d_and_initialize.
0472   template <typename TAcc>
0473   struct BlockSharedMemDynSizeBytes<Kernel_prep_1d_and_initialize, TAcc> {
0474     //! \return The size of the shared memory allocated for a block.
0475     template <typename TVec, typename... TArgs>
0476     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_prep_1d_and_initialize const&,
0477                                                                  TVec const& threadsPerBlock,
0478                                                                  TVec const& elemsPerThread,
0479                                                                  TArgs const&...) -> std::size_t {
0480       // return the amount of dynamic shared memory needed
0481       std::size_t bytes = threadsPerBlock[0u] * elemsPerThread[0u] * (5 * sizeof(bool) + sizeof(char));
0482       return bytes;
0483     }
0484   };
0485 }  // namespace alpaka::trait
0486
0487 #endif  // RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h