Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2025-01-22 07:34:22

0001 #ifndef RecoLocalCalo_EcalRecProducers_plugins_alpaka_AmplitudeComputationCommonKernels_h
0002 #define RecoLocalCalo_EcalRecProducers_plugins_alpaka_AmplitudeComputationCommonKernels_h
0003 
0004 #include <cstdlib>
0005 #include <limits>
0006 #include <alpaka/alpaka.hpp>
0007 
0008 #include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitConditionsDevice.h"
0009 #include "DataFormats/EcalDigi/interface/alpaka/EcalDigiDeviceCollection.h"
0010 #include "DataFormats/EcalRecHit/interface/alpaka/EcalUncalibratedRecHitDeviceCollection.h"
0011 #include "CondFormats/EcalObjects/interface/EcalPulseShapes.h"
0012 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
0013 #include "DataFormats/EcalDigi/interface/EcalMGPASample.h"
0014 #include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
0015 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0016 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
0017 #include "RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h"
0018 
0019 #include "DeclsForKernels.h"
0020 #include "KernelHelpers.h"
0021 
0022 namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
0023 
0024   ///
0025   /// assume kernel launch configuration is
0026   /// (MAXSAMPLES * nchannels, blocks)
0027   /// TODO: is there a point to split this kernel further to separate reductions
0028   ///
0029   class Kernel_prep_1d_and_initialize {
0030   public:
0031     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0032                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0033                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0034                                   EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEB,
0035                                   EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEE,
0036                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0037                                   ::ecal::multifit::SampleVector* amplitudes,
0038                                   ::ecal::multifit::SampleGainVector* gainsNoise,
0039                                   bool* hasSwitchToGain6,
0040                                   bool* hasSwitchToGain1,
0041                                   bool* isSaturated,
0042                                   char* acState,
0043                                   ::ecal::multifit::BXVectorType* bxs,
0044                                   bool const gainSwitchUseMaxSampleEB,
0045                                   bool const gainSwitchUseMaxSampleEE) const {
0046       constexpr bool dynamicPedestal = false;  //---- default to false, ok
0047       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0048       constexpr int sample_max = 5;
0049       constexpr int full_pulse_max = 9;
0050       auto const offsetForHashes = conditionsDev.offsetEE();
0051 
0052       auto const nchannelsEB = digisDevEB.size();
0053       auto const nchannelsEE = digisDevEE.size();
0054       auto const nchannels = nchannelsEB + nchannelsEE;
0055       auto const totalElements = nchannels * nsamples;
0056 
0057       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0058 
0059       char* shared_mem = alpaka::getDynSharedMem<char>(acc);
0060       auto* shr_hasSwitchToGain6 = reinterpret_cast<bool*>(shared_mem);
0061       auto* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + elemsPerBlock;
0062       auto* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + elemsPerBlock;
0063       auto* shr_isSaturated = shr_hasSwitchToGain0 + elemsPerBlock;
0064       auto* shr_hasSwitchToGain0_tmp = shr_isSaturated + elemsPerBlock;
0065       auto* shr_counts = reinterpret_cast<char*>(shr_hasSwitchToGain0_tmp) + elemsPerBlock;
0066 
0067       for (auto block : cms::alpakatools::uniform_groups(acc, totalElements)) {
0068         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0069           // set the output collection size scalars
0070           if (idx.global == 0) {
0071             uncalibRecHitsEB.size() = nchannelsEB;
0072             uncalibRecHitsEE.size() = nchannelsEE;
0073           }
0074 
0075           auto const ch = idx.global / nsamples;
0076           // for accessing input arrays
0077           int const inputTx = ch >= nchannelsEB ? idx.global - nchannelsEB * nsamples : idx.global;
0078           // eb is first and then ee
0079           auto const* digis_in = ch >= nchannelsEB ? digisDevEE.data()->data() : digisDevEB.data()->data();
0080           auto const gainId = ecalMGPA::gainId(digis_in[inputTx]);
0081 
0082           // store into shared mem for initialization
0083           shr_hasSwitchToGain6[idx.local] = gainId == EcalMgpaBitwiseGain6;
0084           shr_hasSwitchToGain1[idx.local] = gainId == EcalMgpaBitwiseGain1;
0085           shr_hasSwitchToGain0_tmp[idx.local] = gainId == EcalMgpaBitwiseGain0;
0086           shr_hasSwitchToGain0[idx.local] = shr_hasSwitchToGain0_tmp[idx.local];
0087           shr_counts[idx.local] = 0;
0088         }
0089 
0090         alpaka::syncBlockThreads(acc);
0091 
0092         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0093           auto const sample = idx.local % nsamples;
0094 
0095           // non-divergent branch (except for the last 4 threads)
0096           if (idx.local <= elemsPerBlock - 5) {
0097             CMS_UNROLL_LOOP
0098             for (int i = 0; i < 5; ++i)
0099               shr_counts[idx.local] += shr_hasSwitchToGain0[idx.local + i];
0100           }
0101           shr_isSaturated[idx.local] = shr_counts[idx.local] == 5;
0102 
0103           //
0104           // unrolled reductions
0105           //
0106           if (sample < 5) {
0107             shr_hasSwitchToGain6[idx.local] = shr_hasSwitchToGain6[idx.local] || shr_hasSwitchToGain6[idx.local + 5];
0108             shr_hasSwitchToGain1[idx.local] = shr_hasSwitchToGain1[idx.local] || shr_hasSwitchToGain1[idx.local + 5];
0109 
0110             // duplication of hasSwitchToGain0 in order not to
0111             // introduce another syncthreads
0112             shr_hasSwitchToGain0_tmp[idx.local] =
0113                 shr_hasSwitchToGain0_tmp[idx.local] || shr_hasSwitchToGain0_tmp[idx.local + 5];
0114           }
0115         }
0116 
0117         alpaka::syncBlockThreads(acc);
0118 
0119         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0120           auto const sample = idx.local % nsamples;
0121 
0122           if (sample < 2) {
0123             // note, both threads per channel take value [3] twice to avoid another if
0124             shr_hasSwitchToGain6[idx.local] = shr_hasSwitchToGain6[idx.local] || shr_hasSwitchToGain6[idx.local + 2] ||
0125                                               shr_hasSwitchToGain6[idx.local + 3];
0126             shr_hasSwitchToGain1[idx.local] = shr_hasSwitchToGain1[idx.local] || shr_hasSwitchToGain1[idx.local + 2] ||
0127                                               shr_hasSwitchToGain1[idx.local + 3];
0128 
0129             shr_hasSwitchToGain0_tmp[idx.local] = shr_hasSwitchToGain0_tmp[idx.local] ||
0130                                                   shr_hasSwitchToGain0_tmp[idx.local + 2] ||
0131                                                   shr_hasSwitchToGain0_tmp[idx.local + 3];
0132 
0133             // sample < 2 -> first 2 threads of each channel will be used here
0134             // => 0 -> will compare 3 and 4 and put into 0
0135             // => 1 -> will compare 4 and 5 and put into 1
0136             shr_isSaturated[idx.local] = shr_isSaturated[idx.local + 3] || shr_isSaturated[idx.local + 4];
0137           }
0138         }
0139 
0140         alpaka::syncBlockThreads(acc);
0141 
0142         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0143           auto const ch = idx.global / nsamples;
0144           auto const sample = idx.local % nsamples;
0145 
0146           if (sample == 0) {
0147             shr_hasSwitchToGain6[idx.local] = shr_hasSwitchToGain6[idx.local] || shr_hasSwitchToGain6[idx.local + 1];
0148             shr_hasSwitchToGain1[idx.local] = shr_hasSwitchToGain1[idx.local] || shr_hasSwitchToGain1[idx.local + 1];
0149             shr_hasSwitchToGain0_tmp[idx.local] =
0150                 shr_hasSwitchToGain0_tmp[idx.local] || shr_hasSwitchToGain0_tmp[idx.local + 1];
0151 
0152             hasSwitchToGain6[ch] = shr_hasSwitchToGain6[idx.local];
0153             hasSwitchToGain1[ch] = shr_hasSwitchToGain1[idx.local];
0154 
0155             shr_isSaturated[idx.local + 3] = shr_isSaturated[idx.local] || shr_isSaturated[idx.local + 1];
0156             isSaturated[ch] = shr_isSaturated[idx.local + 3];
0157           }
0158         }
0159 
0160         // TODO: w/o this sync, there is a race
0161         // if (idx.local == sample_max) below uses max sample thread, not for 0 sample
0162         // check if we can remove it
0163         alpaka::syncBlockThreads(acc);
0164 
0165         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0166           auto const ch = idx.global / nsamples;
0167           auto const sample = idx.local % nsamples;
0168 
0169           // for accessing input arrays
0170           int const inputCh = ch >= nchannelsEB ? ch - nchannelsEB : ch;
0171           int const inputTx = ch >= nchannelsEB ? idx.global - nchannelsEB * nsamples : idx.global;
0172 
0173           auto const* dids = ch >= nchannelsEB ? digisDevEE.id() : digisDevEB.id();
0174           auto const did = DetId{dids[inputCh]};
0175           auto const isBarrel = did.subdetId() == EcalBarrel;
0176           // TODO offset for ee, 0 for eb
0177           auto const hashedId = isBarrel ? reconstruction::hashedIndexEB(did.rawId())
0178                                          : offsetForHashes + reconstruction::hashedIndexEE(did.rawId());
0179 
0180           // eb is first and then ee
0181           auto const* digis_in = ch >= nchannelsEB ? digisDevEE.data()->data() : digisDevEB.data()->data();
0182 
0183           auto* amplitudesForMinimization = reinterpret_cast<::ecal::multifit::SampleVector*>(
0184               ch >= nchannelsEB ? uncalibRecHitsEE.outOfTimeAmplitudes()->data()
0185                                 : uncalibRecHitsEB.outOfTimeAmplitudes()->data());
0186           auto* energies = ch >= nchannelsEB ? uncalibRecHitsEE.amplitude() : uncalibRecHitsEB.amplitude();
0187           auto* chi2 = ch >= nchannelsEB ? uncalibRecHitsEE.chi2() : uncalibRecHitsEB.chi2();
0188           auto* g_pedestal = ch >= nchannelsEB ? uncalibRecHitsEE.pedestal() : uncalibRecHitsEB.pedestal();
0189           auto* dids_out = ch >= nchannelsEB ? uncalibRecHitsEE.id() : uncalibRecHitsEB.id();
0190           auto* flags = ch >= nchannelsEB ? uncalibRecHitsEE.flags() : uncalibRecHitsEB.flags();
0191 
0192           auto const adc = ecalMGPA::adc(digis_in[inputTx]);
0193           auto const gainId = ecalMGPA::gainId(digis_in[inputTx]);
0194           ::ecal::multifit::SampleVector::Scalar amplitude = 0.;
0195           ::ecal::multifit::SampleVector::Scalar pedestal = 0.;
0196           ::ecal::multifit::SampleVector::Scalar gainratio = 0.;
0197 
0198           // TODO: divergent branch
0199           if (gainId == 0 || gainId == 3) {
0200             pedestal = conditionsDev.pedestals_mean_x1()[hashedId];
0201             gainratio = conditionsDev.gain6Over1()[hashedId] * conditionsDev.gain12Over6()[hashedId];
0202             gainsNoise[ch](sample) = 2;
0203           } else if (gainId == 1) {
0204             pedestal = conditionsDev.pedestals_mean_x12()[hashedId];
0205             gainratio = 1.;
0206             gainsNoise[ch](sample) = 0;
0207           } else if (gainId == 2) {
0208             pedestal = conditionsDev.pedestals_mean_x6()[hashedId];
0209             gainratio = conditionsDev.gain12Over6()[hashedId];
0210             gainsNoise[ch](sample) = 1;
0211           }
0212 
0213           // TODO: compile time constant -> branch should be non-divergent
0214           if (dynamicPedestal)
0215             amplitude = static_cast<::ecal::multifit::SampleVector::Scalar>(adc) * gainratio;
0216           else
0217             amplitude = (static_cast<::ecal::multifit::SampleVector::Scalar>(adc) - pedestal) * gainratio;
0218           amplitudes[ch][sample] = amplitude;
0219 
0220 #ifdef ECAL_RECO_ALPAKA_DEBUG
0221           printf("%d %d %d %d %f %f %f\n", idx.global, ch, sample, adc, amplitude, pedestal, gainratio);
0222           if (adc == 0)
0223             printf("adc is zero\n");
0224 #endif
0225 
0226           //
0227           // initialization
0228           //
0229           amplitudesForMinimization[inputCh](sample) = 0;
0230           bxs[ch](sample) = sample - 5;
0231 
0232           // select the thread for the max sample
0233           //---> hardcoded above to be 5th sample, ok
0234           if (sample == sample_max) {
0235             //
0236             // initialization
0237             //
0238             acState[ch] = static_cast<char>(MinimizationState::NotFinished);
0239             energies[inputCh] = 0;
0240             chi2[inputCh] = 0;
0241             g_pedestal[inputCh] = 0;
0242             uint32_t flag = 0;
0243             dids_out[inputCh] = did.rawId();
0244 
0245             // start of this channel in shared mem
0246             auto const chStart = idx.local - sample_max;
0247             // thread for the max sample in shared mem
0248             auto const threadMax = idx.local;
0249             auto const gainSwitchUseMaxSample = isBarrel ? gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE;
0250 
0251             // this flag setting is applied to all of the cases
0252             if (shr_hasSwitchToGain6[chStart])
0253               flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6;
0254             if (shr_hasSwitchToGain1[chStart])
0255               flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1;
0256 
0257             // this corresponds to cpu branching on lastSampleBeforeSaturation
0258             // likely false
0259             // check only for the idx.local corresponding to sample==0
0260             if (sample == 0 && shr_hasSwitchToGain0_tmp[idx.local]) {
0261               // assign for the case some sample having gainId == 0
0262               //energies[inputCh] = amplitudes[ch][sample_max];
0263               energies[inputCh] = amplitude;
0264 
0265               // check if samples before sample_max have true
0266               bool saturated_before_max = false;
0267               CMS_UNROLL_LOOP
0268               for (char ii = 0; ii < 5; ++ii)
0269                 saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii];
0270 
0271               // if saturation is in the max sample and not in the first 5
0272               if (!saturated_before_max && shr_hasSwitchToGain0[threadMax])
0273                 energies[inputCh] = 49140;  // 4095 * 12 (maximum ADC range * MultiGainPreAmplifier (MGPA) gain)
0274                                             // This is the actual maximum range that is set when we saturate.
0275                                             //---- AM FIXME : no pedestal subtraction???
0276                                             //It should be "(4095. - pedestal) * gainratio"
0277 
0278               // set state flag to terminate further processing of this channel
0279               acState[ch] = static_cast<char>(MinimizationState::Precomputed);
0280               flag |= 0x1 << EcalUncalibratedRecHit::kSaturated;
0281               flags[inputCh] = flag;
0282               continue;
0283             }
0284 
0285             // according to cpu version
0286             //            auto max_amplitude = amplitudes[ch][sample_max];
0287             auto const max_amplitude = amplitude;
0288             // pulse shape template value
0289             auto shape_value = conditionsDev.pulseShapes()[hashedId][full_pulse_max - 7];
0290             // note, no syncing as the same thread will be accessing here
0291             bool hasGainSwitch =
0292                 shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3];
0293 
0294             // pedestal is final unconditionally
0295             g_pedestal[inputCh] = pedestal;
0296             if (hasGainSwitch && gainSwitchUseMaxSample) {
0297               // thread for sample=0 will access the right guys
0298               energies[inputCh] = max_amplitude / shape_value;
0299               acState[ch] = static_cast<char>(MinimizationState::Precomputed);
0300               flags[inputCh] = flag;
0301               continue;
0302             }
0303 
0304             // will be used in the future for setting state
0305             auto const rmsForChecking = conditionsDev.pedestals_rms_x12()[hashedId];
0306 
0307             // this happens cause sometimes rms_x12 is 0...
0308             // needs to be checkec why this is the case
0309             // general case here is that noisecov is a Zero matrix
0310             if (rmsForChecking == 0) {
0311               acState[ch] = static_cast<char>(MinimizationState::Precomputed);
0312               flags[inputCh] = flag;
0313               continue;
0314             }
0315 
0316             // for the case when no shortcuts were taken
0317             flags[inputCh] = flag;
0318           }
0319         }
0320       }
0321     }
0322   };
0323 
0324   ///
0325   /// assume kernel launch configuration is
0326   /// ([MAXSAMPLES, MAXSAMPLES], nchannels)
0327   ///
0328   class Kernel_prep_2d {
0329   public:
0330     ALPAKA_FN_ACC void operator()(Acc2D const& acc,
0331                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0332                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0333                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0334                                   ::ecal::multifit::SampleGainVector const* gainsNoise,
0335                                   ::ecal::multifit::SampleMatrix* noisecov,
0336                                   ::ecal::multifit::PulseMatrixType* pulse_matrix,
0337                                   bool const* hasSwitchToGain6,
0338                                   bool const* hasSwitchToGain1,
0339                                   bool const* isSaturated) const {
0340       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0341       auto const offsetForHashes = conditionsDev.offsetEE();
0342       auto const nchannelsEB = digisDevEB.size();
0343       constexpr float addPedestalUncertainty = 0.f;
0344       constexpr bool dynamicPedestal = false;
0345       constexpr bool simplifiedNoiseModelForGainSwitch = true;  //---- default is true
0346 
0347       // pulse matrix
0348       auto const* pulse_shapes = reinterpret_cast<const EcalPulseShape*>(conditionsDev.pulseShapes()->data());
0349 
0350       auto const blockDimX = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[1u];
0351       auto const elemsPerBlockX = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[1u];
0352       auto const elemsPerBlockY = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0353       Vec2D const size_2d = {elemsPerBlockY, blockDimX * elemsPerBlockX};  // {y, x} coordinates
0354 
0355       for (auto ndindex : cms::alpakatools::uniform_elements_nd(acc, size_2d)) {
0356         auto const ch = ndindex[1] / nsamples;
0357         auto const tx = ndindex[1] % nsamples;
0358         auto const ty = ndindex[0];
0359 
0360         // to access input arrays (ids and digis only)
0361         int const inputCh = ch >= nchannelsEB ? ch - nchannelsEB : ch;
0362         auto const* dids = ch >= nchannelsEB ? digisDevEE.id() : digisDevEB.id();
0363 
0364         auto const did = DetId{dids[inputCh]};
0365         auto const isBarrel = did.subdetId() == EcalBarrel;
0366         auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0367                                        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0368         auto const* G12SamplesCorrelation = isBarrel ? conditionsDev.sampleCorrelation_EB_G12().data()
0369                                                      : conditionsDev.sampleCorrelation_EE_G12().data();
0370         auto const* G6SamplesCorrelation =
0371             isBarrel ? conditionsDev.sampleCorrelation_EB_G6().data() : conditionsDev.sampleCorrelation_EE_G6().data();
0372         auto const* G1SamplesCorrelation =
0373             isBarrel ? conditionsDev.sampleCorrelation_EB_G1().data() : conditionsDev.sampleCorrelation_EE_G1().data();
0374         auto const hasGainSwitch = hasSwitchToGain6[ch] || hasSwitchToGain1[ch] || isSaturated[ch];
0375 
0376         auto const vidx = std::abs(static_cast<int>(ty) - static_cast<int>(tx));
0377 
0378         // non-divergent branch for all threads per block
0379         if (hasGainSwitch) {
0380           // TODO: did not include simplified noise model
0381           float noise_value = 0;
0382 
0383           // non-divergent branch - all threads per block
0384           // TODO: all of these constants indicate that
0385           // that these parts could be splitted into completely different
0386           // kernels and run one of them only depending on the config
0387           if (simplifiedNoiseModelForGainSwitch) {
0388             constexpr int isample_max = 5;  // according to cpu defs
0389             auto const gainidx = gainsNoise[ch][isample_max];
0390 
0391             // non-divergent branches
0392             if (gainidx == 0) {
0393               auto const rms_x12 = conditionsDev.pedestals_rms_x12()[hashedId];
0394               noise_value = rms_x12 * rms_x12 * G12SamplesCorrelation[vidx];
0395             } else if (gainidx == 1) {
0396               auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
0397               auto const rms_x6 = conditionsDev.pedestals_rms_x6()[hashedId];
0398               noise_value = gain12Over6 * gain12Over6 * rms_x6 * rms_x6 * G6SamplesCorrelation[vidx];
0399             } else if (gainidx == 2) {
0400               auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
0401               auto const gain6Over1 = conditionsDev.gain6Over1()[hashedId];
0402               auto const gain12Over1 = gain12Over6 * gain6Over1;
0403               auto const rms_x1 = conditionsDev.pedestals_rms_x1()[hashedId];
0404               noise_value = gain12Over1 * gain12Over1 * rms_x1 * rms_x1 * G1SamplesCorrelation[vidx];
0405             }
0406             if (!dynamicPedestal && addPedestalUncertainty > 0.f)
0407               noise_value += addPedestalUncertainty * addPedestalUncertainty;
0408           } else {
0409             int gainidx = 0;
0410             char mask = gainidx;
0411             int pedestal = gainsNoise[ch][ty] == mask ? 1 : 0;
0412             //            NB: gainratio is 1, that is why it does not appear in the formula
0413             auto const rms_x12 = conditionsDev.pedestals_rms_x12()[hashedId];
0414             noise_value += rms_x12 * rms_x12 * pedestal * G12SamplesCorrelation[vidx];
0415             // non-divergent branch
0416             if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0417               noise_value += addPedestalUncertainty * addPedestalUncertainty * pedestal;  // gainratio is 1
0418             }
0419 
0420             //
0421             gainidx = 1;
0422             mask = gainidx;
0423             pedestal = gainsNoise[ch][ty] == mask ? 1 : 0;
0424             auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
0425             auto const rms_x6 = conditionsDev.pedestals_rms_x6()[hashedId];
0426             noise_value += gain12Over6 * gain12Over6 * rms_x6 * rms_x6 * pedestal * G6SamplesCorrelation[vidx];
0427             // non-divergent branch
0428             if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0429               noise_value += gain12Over6 * gain12Over6 * addPedestalUncertainty * addPedestalUncertainty * pedestal;
0430             }
0431 
0432             //
0433             gainidx = 2;
0434             mask = gainidx;
0435             pedestal = gainsNoise[ch][ty] == mask ? 1 : 0;
0436             auto const gain6Over1 = conditionsDev.gain6Over1()[hashedId];
0437             auto const gain12Over1 = gain12Over6 * gain6Over1;
0438             auto const rms_x1 = conditionsDev.pedestals_rms_x1()[hashedId];
0439             noise_value += gain12Over1 * gain12Over1 * rms_x1 * rms_x1 * pedestal * G1SamplesCorrelation[vidx];
0440             // non-divergent branch
0441             if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0442               noise_value += gain12Over1 * gain12Over1 * addPedestalUncertainty * addPedestalUncertainty * pedestal;
0443             }
0444           }
0445 
0446           noisecov[ch](ty, tx) = noise_value;
0447         } else {
0448           auto const rms = conditionsDev.pedestals_rms_x12()[hashedId];
0449           float noise_value = rms * rms * G12SamplesCorrelation[vidx];
0450           if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
0451             //----  add fully correlated component to noise covariance to inflate pedestal uncertainty
0452             noise_value += addPedestalUncertainty * addPedestalUncertainty;
0453           }
0454           noisecov[ch](ty, tx) = noise_value;
0455         }
0456 
0457         auto const posToAccess = 9 - static_cast<int>(tx) + static_cast<int>(ty);  // see cpu for reference
0458         float const value = posToAccess >= 7 ? pulse_shapes[hashedId].pdfval[posToAccess - 7] : 0;
0459         pulse_matrix[ch](ty, tx) = value;
0460       }
0461     }
0462   };
0463 
0464 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit
0465 
0466 namespace alpaka::trait {
0467   using namespace ALPAKA_ACCELERATOR_NAMESPACE;
0468   using namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit;
0469 
0470   //! The trait for getting the size of the block shared dynamic memory for Kernel_prep_1d_and_initialize.
0471   template <>
0472   struct BlockSharedMemDynSizeBytes<Kernel_prep_1d_and_initialize, Acc1D> {
0473     //! \return The size of the shared memory allocated for a block.
0474     template <typename TVec, typename... TArgs>
0475     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_prep_1d_and_initialize const&,
0476                                                                  TVec const& threadsPerBlock,
0477                                                                  TVec const& elemsPerThread,
0478                                                                  TArgs const&...) -> std::size_t {
0479       // return the amount of dynamic shared memory needed
0480       std::size_t bytes = threadsPerBlock[0u] * elemsPerThread[0u] * (5 * sizeof(bool) + sizeof(char));
0481       return bytes;
0482     }
0483   };
0484 }  // namespace alpaka::trait
0485 
0486 #endif  // RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h