Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-20 02:32:08

0001 #ifndef RecoLocalCalo_EcalRecProducers_plugins_alpaka_TimeComputationKernels_h
0002 #define RecoLocalCalo_EcalRecProducers_plugins_alpaka_TimeComputationKernels_h
0003 
0004 #include <cassert>
0005 #include <cmath>
0006 #include <limits>
0007 
0008 #include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitConditionsDevice.h"
0009 #include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitParametersDevice.h"
0010 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
0011 #include "DataFormats/EcalDigi/interface/EcalMGPASample.h"
0012 #include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
0013 #include "DataFormats/Math/interface/approx_exp.h"
0014 #include "DataFormats/Math/interface/approx_log.h"
0015 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0016 #include "RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h"
0017 
0018 #include "DeclsForKernels.h"
0019 #include "KernelHelpers.h"
0020 
0021 //#define ECAL_RECO_ALPAKA_DEBUG
0022 
0023 namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
0024 
0025   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool use_sample(unsigned int sample_mask, unsigned int sample) {
0026     return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1)));
0027   }
0028 
  // Fast device-side approximations of expf/logf from the approx_exp/approx_log
  // headers; the template argument selects the approximation accuracy level
  // (see DataFormats/Math/interface/approx_exp.h / approx_log.h for the exact
  // accuracy/speed trade-off — TODO confirm against those headers).
  ALPAKA_FN_ACC constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }
  ALPAKA_FN_ACC constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }
0031 
  // Null-hypothesis kernel: for every channel, fit the usable samples with a
  // constant (each sample weighted by 1/error^2) and store the resulting
  // per-channel chi2 together with the accumulated sums (sum0, sumAA) that the
  // later time-computation kernels reuse.
  //
  // Launch context: one element per (channel, sample), i.e. nchannels * 10
  // elements, with a shared-memory tree reduction over the 10 samples of each
  // channel (10 consecutive elements of a block belong to the same channel).
  class Kernel_time_compute_nullhypot {
    using ScalarType = ::ecal::multifit::SampleVector::Scalar;

  public:
    // sample_values / sample_value_errors / useless_sample_values:
    //   per-sample inputs, indexed as ch * MAXSAMPLES + sample.
    // chi2s / sum0s / sumAAs: per-channel outputs (written by sample 0 only).
    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
    ALPAKA_FN_ACC void operator()(TAcc const& acc,
                                  ScalarType* const sample_values,
                                  ScalarType* const sample_value_errors,
                                  bool* const useless_sample_values,
                                  ScalarType* chi2s,
                                  ScalarType* sum0s,
                                  ScalarType* sumAAs,
                                  uint32_t const nchannels) const {
      constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;

      // indices
      auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];

      // shared mem inits
      // Dynamic shared memory layout: a char-sized counter array (s_sum0)
      // followed by three ScalarType arrays, each elemsPerBlock long.
      // NOTE(review): s_sum1 is formed from a char* offset by elemsPerBlock —
      // assumes the resulting address is suitably aligned for ScalarType;
      // verify against the kernel's shared-memory size trait.
      auto* s_sum0 = alpaka::getDynSharedMem<char>(acc);
      auto* s_sum1 = reinterpret_cast<ScalarType*>(s_sum0 + elemsPerBlock);
      auto* s_sumA = s_sum1 + elemsPerBlock;
      auto* s_sumAA = s_sumA + elemsPerBlock;

      for (auto txforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
        // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
        auto tx = nchannels * nsamples - 1 - txforward;
        auto const ch = tx / nsamples;

        auto const sample = tx % nsamples;
        auto const ltx = tx % elemsPerBlock;

        // TODO make sure no div by 0
        // Useless samples contribute zero weight and are excluded from sum0.
        auto const inv_error =
            useless_sample_values[tx] ? 0. : 1. / (sample_value_errors[tx] * sample_value_errors[tx]);
        auto const sample_value = sample_values[tx];
        // Per-element contributions: count, sum of weights, weighted value,
        // weighted value squared.
        s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1;
        s_sum1[ltx] = inv_error;
        s_sumA[ltx] = sample_value * inv_error;
        s_sumAA[ltx] = sample_value * sample_value * inv_error;
        alpaka::syncBlockThreads(acc);

        // Hardcoded tree reduction over the 10 samples of one channel:
        // 10 -> 5, then 5 -> 2 (double-counting sample 3), then 2 -> 1
        // (subtracting the double-counted sample 3 back out).
        // 5 threads for [0, 4] samples
        if (sample < 5) {
          s_sum0[ltx] += s_sum0[ltx + 5];
          s_sum1[ltx] += s_sum1[ltx + 5];
          s_sumA[ltx] += s_sumA[ltx + 5];
          s_sumAA[ltx] += s_sumAA[ltx + 5];
        }
        alpaka::syncBlockThreads(acc);

        if (sample < 2) {
          // note double counting of sample 3
          s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3];
          s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3];
          s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3];
          s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3];
        }
        alpaka::syncBlockThreads(acc);

        if (sample == 0) {
          // note, subtract to remove the double counting of sample == 3
          auto const sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3];
          auto const sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3];
          auto const sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3];
          auto const sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3];
          // chi2 of the best weighted constant fit, normalized per usable sample;
          // zero when the channel has no usable samples at all.
          auto const chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast<ScalarType>(0);
          chi2s[ch] = chi2;
          sum0s[ch] = sum0;
          sumAAs[ch] = sumAA;

#ifdef DEBUG_TC_NULLHYPOT
          if (ch == 0) {
            printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast<int>(sum0), sumAA);
          }
#endif
        }
      }
    }
  };
0112 
0113   //
0114   // launch ctx parameters are
0115   // 45 threads per channel, X channels per block, Y blocks
0116   // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
0117   // TODO: it might be much beter to use 32 threads per channel instead of 45
0118   // to simplify the synchronization
  // Ratio-method time estimation kernel. For each channel, every ordered pair
  // of samples (i, j), i < j, is handled by one of 45 threads: the ratio of
  // the two sample values yields a candidate pulse time (tmax) with an error
  // (tmaxerr) and a chi2. Two block-level reductions then (a) find the minimum
  // chi2 per channel and form a chi2-gated weighted average of the tmax
  // candidates, and (b) accumulate a second time estimate from consecutive
  // pairs only (shrTimeMax/shrTimeWgt). Results and a per-channel
  // TimeComputationState are written to global memory for the next kernel.
  class Kernel_time_compute_makeratio {
    using ScalarType = ::ecal::multifit::SampleVector::Scalar;

  public:
    // digisDevEB/digisDevEE: input digi views; EB channels come first, so a
    //   global channel index >= digisDevEB.size() belongs to EE.
    // sample_values / sample_value_errors / useless_sample_values:
    //   per-sample inputs, indexed as ch * MAXSAMPLES + sample.
    // pedestal_nums: per-channel count of pedestal samples (0, 1 or 2).
    // sumAAsNullHypot / sum0sNullHypot: per-channel sums from the
    //   null-hypothesis kernel.
    // tMaxAlphaBetas .. g_state: per-channel outputs.
    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
    ALPAKA_FN_ACC void operator()(TAcc const& acc,
                                  EcalDigiDeviceCollection::ConstView digisDevEB,
                                  EcalDigiDeviceCollection::ConstView digisDevEE,
                                  ScalarType* const sample_values,
                                  ScalarType* const sample_value_errors,
                                  bool* const useless_sample_values,
                                  char* const pedestal_nums,
                                  ScalarType* const sumAAsNullHypot,
                                  ScalarType* const sum0sNullHypot,
                                  ScalarType* tMaxAlphaBetas,
                                  ScalarType* tMaxErrorAlphaBetas,
                                  ScalarType* g_accTimeMax,
                                  ScalarType* g_accTimeWgt,
                                  TimeComputationState* g_state,
                                  EcalMultifitParametersDevice::ConstView paramsDev,
                                  ConfigurationParameters::type const timeFitLimits_firstEB,
                                  ConfigurationParameters::type const timeFitLimits_firstEE,
                                  ConfigurationParameters::type const timeFitLimits_secondEB,
                                  ConfigurationParameters::type const timeFitLimits_secondEE) const {
      // constants
      constexpr uint32_t nchannels_per_block = 10;
      constexpr auto nthreads_per_channel = nchannels_per_block * (nchannels_per_block - 1) / 2;
      constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
      auto const nchannels = digisDevEB.size() + digisDevEE.size();
      auto const offsetForInputs = digisDevEB.size();
      auto const totalElements = nthreads_per_channel * nchannels;

      auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
      assert(nthreads_per_channel * nchannels_per_block == elemsPerBlock);

      // Dynamic shared memory: eight ScalarType arrays followed by three bool
      // arrays, each elemsPerBlock long. shr_chi2s is the reduction scratch for
      // the min-chi2 search while shr_chi2 keeps each element's own chi2.
      auto* shr_chi2s = alpaka::getDynSharedMem<ScalarType>(acc);
      auto* shr_time_wgt = shr_chi2s + elemsPerBlock;
      auto* shr_time_max = shr_time_wgt + elemsPerBlock;
      auto* shrTimeMax = shr_time_max + elemsPerBlock;
      auto* shrTimeWgt = shrTimeMax + elemsPerBlock;
      auto* shr_chi2 = shrTimeWgt + elemsPerBlock;
      auto* shr_tmax = shr_chi2 + elemsPerBlock;
      auto* shr_tmaxerr = shr_tmax + elemsPerBlock;
      auto* shr_condForUselessSamples = reinterpret_cast<bool*>(shr_tmaxerr + elemsPerBlock);
      auto* shr_internalCondForSkipping1 = shr_condForUselessSamples + elemsPerBlock;
      auto* shr_internalCondForSkipping2 = shr_internalCondForSkipping1 + elemsPerBlock;

      for (auto block : cms::alpakatools::uniform_groups(acc, totalElements)) {
        for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
          auto const ch = idx.global / nthreads_per_channel;
          auto const ltx = idx.global % nthreads_per_channel;

          auto const ch_start = ch * nsamples;
          auto const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
          auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();

          // Select the EB or EE parameter set based on the channel's DetId.
          auto const did = DetId{dids[inputCh]};
          auto const isBarrel = did.subdetId() == EcalBarrel;
          auto* const amplitudeFitParameters =
              isBarrel ? paramsDev.amplitudeFitParamsEB().data() : paramsDev.amplitudeFitParamsEE().data();
          auto* const timeFitParameters =
              isBarrel ? paramsDev.timeFitParamsEB().data() : paramsDev.timeFitParamsEE().data();
          auto const timeFitParameters_size =
              isBarrel ? paramsDev.timeFitParamsEB().size() : paramsDev.timeFitParamsEE().size();
          auto const timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE;
          auto const timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE;

          // map tx -> (sample_i, sample_j)
          // Unflatten the per-channel thread index ltx in [0, 44] into the
          // sample pair (i, j), i < j, enumerated row by row:
          // ltx 0..8 -> (0,1..9), ltx 9..16 -> (1,2..9), ..., ltx 44 -> (8,9).
          int sample_i = 0;
          int sample_j = 0;
          if (ltx <= 8) {
            sample_i = 0;
            sample_j = 1 + ltx;
          } else if (ltx <= 16) {
            sample_i = 1;
            sample_j = 2 + ltx - 9;
          } else if (ltx <= 23) {
            sample_i = 2;
            sample_j = 3 + ltx - 17;
          } else if (ltx <= 29) {
            sample_i = 3;
            sample_j = 4 + ltx - 24;
          } else if (ltx <= 34) {
            sample_i = 4;
            sample_j = 5 + ltx - 30;
          } else if (ltx <= 38) {
            sample_i = 5;
            sample_j = 6 + ltx - 35;
          } else if (ltx <= 41) {
            sample_i = 6;
            sample_j = 7 + ltx - 39;
          } else if (ltx <= 43) {
            sample_i = 7;
            sample_j = 8 + ltx - 42;
          } else if (ltx <= 44) {
            sample_i = 8;
            sample_j = 9;
          } else
            assert(false);

          auto const tx_i = ch_start + sample_i;
          auto const tx_j = ch_start + sample_j;

          //
          // note, given the way we partition the block, with 45 threads per channel
          // we will end up with inactive threads which need to be dragged along
          // through the synching point
          //
          // A pair is unusable if either sample is flagged useless or its value
          // is too small (<= 1) for a meaningful ratio.
          bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] ||
                                             sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1;

          //
          // see cpu implementation for explanation
          //
          // Defaults for skipped/inactive pairs: chi2 at max so the min-chi2
          // reduction ignores them, zero time/weight contributions.
          ScalarType chi2 = std::numeric_limits<ScalarType>::max();
          ScalarType tmax = 0;
          ScalarType tmaxerr = 0;
          shrTimeMax[idx.local] = 0;
          shrTimeWgt[idx.local] = 0;

          bool internalCondForSkipping1 = true;
          bool internalCondForSkipping2 = true;
          if (!condForUselessSamples) {
            auto const rtmp = sample_values[tx_i] / sample_values[tx_j];
            auto const invampl_i = 1. / sample_values[tx_i];
            auto const relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i;
            auto const invampl_j = 1. / sample_values[tx_j];
            auto const relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j;
            auto const err1 = rtmp * rtmp * (relErr2_i + relErr2_j);
            auto err2 =
                sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j);
            // TODO non-divergent branch for a block if each block has 1 channel
            // otherwise non-divergent for groups of 45 threads
            // at this point, pedestal_nums[ch] can be either 0, 1 or 2
            if (pedestal_nums[ch] == 2)
              err2 *= err2 * 0.5;
            auto const err3 = (0.289 * 0.289) * (invampl_j * invampl_j);
            auto const total_error = std::sqrt(err1 + err2 + err3);

            auto const alpha = amplitudeFitParameters[0];
            auto const beta = amplitudeFitParameters[1];
            auto const alphabeta = alpha * beta;
            auto const invalphabeta = 1. / alphabeta;

            // variables instead of a struct
            auto const ratio_index = sample_i;
            auto const ratio_step = sample_j - sample_i;
            auto const ratio_value = rtmp;
            auto const ratio_error = total_error;

            // Skip the pair unless the ratio is well measured and lies inside
            // the physically allowed band (0.001, exp(step/beta) - 0.001).
            auto const rlim_i_j = fast_expf(static_cast<ScalarType>(sample_j - sample_i) / beta) - 0.001;
            internalCondForSkipping1 = !(total_error < 1. && rtmp > 0.001 && rtmp < rlim_i_j);
            if (!internalCondForSkipping1) {
              //
              // precompute.
              // in cpu version this was done conditionally
              // however easier to do it here (precompute) and then just filter out
              // if not needed
              //
              auto const l_timeFitLimits_first = timeFitLimits_first;
              auto const l_timeFitLimits_second = timeFitLimits_second;
              // Consecutive-sample pairs inside the fit limits contribute to
              // the polynomial-fit time estimate: u is the polynomial in the
              // ratio (Horner scheme), du its derivative for error propagation.
              if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) {
                auto const time_max_i = static_cast<ScalarType>(ratio_index);
                auto u = timeFitParameters[timeFitParameters_size - 1];
                CMS_UNROLL_LOOP
                for (int k = timeFitParameters_size - 2; k >= 0; --k)
                  u = u * ratio_value + timeFitParameters[k];

                auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]);
                for (int k = timeFitParameters_size - 2; k >= 1; --k)
                  du = du * ratio_value + k * timeFitParameters[k];

                auto const error2 = ratio_error * ratio_error * du * du;
                auto const time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast<ScalarType>(0);
                auto const time_wgt = error2 > 0 ? 1. / error2 : static_cast<ScalarType>(0);

                // store into shared mem
                // note, this name is essentially identical to the one used
                // below.
                shrTimeMax[idx.local] = error2 > 0 ? time_max : 0;
                shrTimeWgt[idx.local] = error2 > 0 ? time_wgt : 0;
              } else {
                shrTimeMax[idx.local] = 0;
                shrTimeWgt[idx.local] = 0;
              }

              // continue with ratios
              // Analytic inversion of the alpha-beta pulse shape: propagate the
              // ratio +/- its error to a time interval [time2, time1] and take
              // its center as tmax and half-width as tmaxerr.
              auto const stepOverBeta = static_cast<ScalarType>(ratio_step) / beta;
              auto const offset = static_cast<ScalarType>(ratio_index) + alphabeta;
              auto const rmin = std::max(ratio_value - ratio_error, 0.001);
              auto const rmax =
                  std::min(ratio_value + ratio_error, fast_expf(static_cast<ScalarType>(ratio_step) / beta) - 0.001);
              auto const time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.);
              auto const time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.);

              // set these guys
              tmax = 0.5 * (time1 + time2);
              tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
#ifdef DEBUG_TC_MAKERATIO
              if (ch == 1 || ch == 0)
                printf(
                    "ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = "
                    "%f\n",
                    ch,
                    ltx,
                    tmax,
                    tmaxerr,
                    time1,
                    time2,
                    offset,
                    rmin,
                    rmax);
#endif

              // chi2 of the alpha-beta pulse shape placed at tmax against all
              // usable samples of the channel.
              ScalarType sumAf = 0;
              ScalarType sumff = 0;
              const int itmin = std::max(-1, static_cast<int>(std::floor(tmax - alphabeta)));
              auto loffset = (static_cast<ScalarType>(itmin) - tmax) * invalphabeta;
              // TODO: data dependence
              for (int it = itmin + 1; it < nsamples; ++it) {
                loffset += invalphabeta;
                if (useless_sample_values[ch_start + it])
                  continue;
                auto const inverr2 = 1. / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]);
                auto const term1 = 1. + loffset;
                auto const f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0;
                sumAf += sample_values[ch_start + it] * (f * inverr2);
                sumff += f * (f * inverr2);
              }

              auto const sumAA = sumAAsNullHypot[ch];
              auto const sum0 = sum0sNullHypot[ch];
              chi2 = sumAA;
              // TODO: sum0 can not be 0 below, need to introduce the check upfront
              if (sumff > 0) {
                chi2 = sumAA - sumAf * (sumAf / sumff);
              }
              chi2 /= sum0;

#ifdef DEBUG_TC_MAKERATIO
              if (ch == 1 || ch == 0)
                printf(
                    "ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = "
                    "%f\n",
                    ch,
                    ltx,
                    sumAf,
                    sumff,
                    sumAA,
                    static_cast<int>(sum0),
                    tmax,
                    tmaxerr,
                    chi2);
#endif

              // Keep the candidate only if it is physical; otherwise reset its
              // chi2 so it loses the min-chi2 reduction below.
              if (chi2 > 0 && tmax > 0 && tmaxerr > 0)
                internalCondForSkipping2 = false;
              else
                chi2 = std::numeric_limits<ScalarType>::max();
            }
          }

          // store into smem
          shr_chi2s[idx.local] = chi2;
          shr_chi2[idx.local] = chi2;
          shr_tmax[idx.local] = tmax;
          shr_tmaxerr[idx.local] = tmaxerr;
          shr_condForUselessSamples[idx.local] = condForUselessSamples;
          shr_internalCondForSkipping1[idx.local] = internalCondForSkipping1;
          shr_internalCondForSkipping2[idx.local] = internalCondForSkipping2;
        }

        alpaka::syncBlockThreads(acc);

        // find min chi2 - quite crude for now
        // TODO validate/check
        // Pairwise min reduction over the 45 elements of each channel; the
        // odd-element bookkeeping keeps the last element from being folded
        // twice when the active range has odd length.
        auto iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
        bool oddElements = nthreads_per_channel % 2;
        CMS_UNROLL_LOOP
        while (iter >= 1) {
          for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
            auto const ltx = idx.global % nthreads_per_channel;

            if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) {
              // for odd ns, the last guy will just store itself
              // exception is for ltx == 0 and iter==1
              shr_chi2s[idx.local] = std::min(shr_chi2s[idx.local], shr_chi2s[idx.local + iter]);
            }
          }
          alpaka::syncBlockThreads(acc);

          oddElements = iter % 2;
          iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
        }

        for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
          auto const ltx = idx.global % nthreads_per_channel;

          // get precomputedflags for this element from shared memory
          auto const condForUselessSamples = shr_condForUselessSamples[idx.local];
          auto const internalCondForSkipping1 = shr_internalCondForSkipping1[idx.local];
          auto const internalCondForSkipping2 = shr_internalCondForSkipping2[idx.local];
          // filter out inactive or useless samples threads
          if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) {
            // min chi2, now compute weighted average of tmax measurements
            // see cpu version for more explanation
            // Only candidates within chi2min + 1 enter the weighted average
            // (weight = 1/tmaxerr^2); idx.local - ltx is element 0 of this
            // channel, which holds the reduced minimum.
            auto const chi2 = shr_chi2[idx.local];
            auto const chi2min = shr_chi2s[idx.local - ltx];
            auto const chi2Limit = chi2min + 1.;
            auto const tmaxerr = shr_tmaxerr[idx.local];
            auto const inverseSigmaSquared = chi2 < chi2Limit ? 1. / (tmaxerr * tmaxerr) : 0.;

            // NOTE(review): the debug block below reads 'ch' before the local
            // declaration inside it — it does not compile when
            // DEBUG_TC_MAKERATIO is defined; verify.
#ifdef DEBUG_TC_MAKERATIO
            if (ch == 1 || ch == 0) {
              auto const ch = idx.global / nthreads_per_channel;
              printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
                     ch,
                     ltx,
                     chi2min,
                     chi2Limit,
                     inverseSigmaSquared);
            }
#endif

            // store into shared mem and run reduction
            // TODO: check if cooperative groups would be better
            // TODO: check if shuffling intrinsics are better
            auto const tmax = shr_tmax[idx.local];
            shr_time_wgt[idx.local] = inverseSigmaSquared;
            shr_time_max[idx.local] = tmax * inverseSigmaSquared;
          } else {
            shr_time_wgt[idx.local] = 0;
            shr_time_max[idx.local] = 0;
          }
        }

        alpaka::syncBlockThreads(acc);

        // reduce to compute time_max and time_wgt
        // Same pairwise scheme as above, but summing four accumulators at once.
        iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
        oddElements = nthreads_per_channel % 2;
        CMS_UNROLL_LOOP
        while (iter >= 1) {
          for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
            auto const ltx = idx.global % nthreads_per_channel;

            if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) {
              shr_time_wgt[idx.local] += shr_time_wgt[idx.local + iter];
              shr_time_max[idx.local] += shr_time_max[idx.local + iter];
              shrTimeMax[idx.local] += shrTimeMax[idx.local + iter];
              shrTimeWgt[idx.local] += shrTimeWgt[idx.local + iter];
            }
          }

          alpaka::syncBlockThreads(acc);
          oddElements = iter % 2;
          iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
        }

        for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
          auto const ltx = idx.global % nthreads_per_channel;

          // load from shared memory the 0th guy (will contain accumulated values)
          // compute
          // store into global mem
          if (ltx == 0) {
            auto const ch = idx.global / nthreads_per_channel;
            auto const tmp_time_max = shr_time_max[idx.local];
            auto const tmp_time_wgt = shr_time_wgt[idx.local];

            // we are done if there number of time ratios is 0
            if (tmp_time_wgt == 0 && tmp_time_max == 0) {
              g_state[ch] = TimeComputationState::Finished;
              continue;
            }

            // no div by 0
            auto const tMaxAlphaBeta = tmp_time_max / tmp_time_wgt;
            auto const tMaxErrorAlphaBeta = 1. / std::sqrt(tmp_time_wgt);

            tMaxAlphaBetas[ch] = tMaxAlphaBeta;
            tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta;
            g_accTimeMax[ch] = shrTimeMax[idx.local];
            g_accTimeWgt[ch] = shrTimeWgt[idx.local];
            g_state[ch] = TimeComputationState::NotFinished;

#ifdef DEBUG_TC_MAKERATIO
            printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt);
            printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
                   ch,
                   tMaxAlphaBeta,
                   tMaxErrorAlphaBeta,
                   shrTimeMax[idx.local],
                   shrTimeWgt[idx.local]);
#endif
          }
        }
      }
    }
  };
0519 
0520   class Kernel_time_compute_findamplchi2_and_finish {
0521     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0522 
0523   public:
0524     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0525     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0526                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0527                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0528                                   ScalarType* const sample_values,
0529                                   ScalarType* const sample_value_errors,
0530                                   bool* const useless_samples,
0531                                   ScalarType* const g_tMaxAlphaBeta,
0532                                   ScalarType* const g_tMaxErrorAlphaBeta,
0533                                   ScalarType* const g_accTimeMax,
0534                                   ScalarType* const g_accTimeWgt,
0535                                   ScalarType* const sumAAsNullHypot,
0536                                   ScalarType* const sum0sNullHypot,
0537                                   ScalarType* const chi2sNullHypot,
0538                                   TimeComputationState* g_state,
0539                                   ScalarType* g_ampMaxAlphaBeta,
0540                                   ScalarType* g_ampMaxError,
0541                                   ScalarType* g_timeMax,
0542                                   ScalarType* g_timeError,
0543                                   EcalMultifitParametersDevice::ConstView paramsDev) const {
0544       /// launch ctx parameters are
0545       /// 10 threads per channel, N channels per block, Y blocks
0546       /// TODO: do we need to keep the state around or can be removed?!
0547       //#define DEBUG_FINDAMPLCHI2_AND_FINISH
0548 
0549       // constants
0550       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0551       auto const nchannels = digisDevEB.size() + digisDevEE.size();
0552       auto const offsetForInputs = digisDevEB.size();
0553 
0554       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0555 
0556       // configure shared mem
0557       // per block, we need #threads per block * 2 * sizeof(ScalarType)
0558       // we run with N channels per block
0559       auto* shr_sumAf = alpaka::getDynSharedMem<ScalarType>(acc);
0560       auto* shr_sumff = shr_sumAf + elemsPerBlock;
0561 
0562       for (auto gtxforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
0563         // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
0564         auto gtx = nchannels * nsamples - 1 - gtxforward;
0565         auto const ch = gtx / nsamples;
0566         auto const elemIdx = gtx % elemsPerBlock;
0567         auto const sample = elemIdx % nsamples;
0568 
0569         auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
0570         auto const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0571 
0572         auto state = g_state[ch];
0573         auto const did = DetId{dids[inputCh]};
0574         auto* const amplitudeFitParameters = did.subdetId() == EcalBarrel ? paramsDev.amplitudeFitParamsEB().data()
0575                                                                           : paramsDev.amplitudeFitParamsEE().data();
0576 
0577         // TODO is that better than storing into global and launching another kernel
0578         // for the first 10 threads
0579         if (state == TimeComputationState::NotFinished) {
0580           auto const alpha = amplitudeFitParameters[0];
0581           auto const beta = amplitudeFitParameters[1];
0582           auto const alphabeta = alpha * beta;
0583           auto const invalphabeta = 1. / alphabeta;
0584           auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
0585           auto const sample_value = sample_values[gtx];
0586           auto const sample_value_error = sample_value_errors[gtx];
0587           auto const inverr2 =
0588               useless_samples[gtx] ? static_cast<ScalarType>(0) : 1. / (sample_value_error * sample_value_error);
0589           auto const offset = (static_cast<ScalarType>(sample) - tMaxAlphaBeta) * invalphabeta;
0590           auto const term1 = 1. + offset;
0591           auto const f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast<ScalarType>(0.);
0592           auto const sumAf = sample_value * (f * inverr2);
0593           auto const sumff = f * (f * inverr2);
0594 
0595           // store into shared mem
0596           shr_sumAf[elemIdx] = sumAf;
0597           shr_sumff[elemIdx] = sumff;
0598         } else {
0599           shr_sumAf[elemIdx] = 0;
0600           shr_sumff[elemIdx] = 0;
0601         }
0602 
0603         alpaka::syncBlockThreads(acc);
0604 
0605         // reduce
0606         // unroll completely here (but hardcoded)
0607         if (sample < 5) {
0608           shr_sumAf[elemIdx] += shr_sumAf[elemIdx + 5];
0609           shr_sumff[elemIdx] += shr_sumff[elemIdx + 5];
0610         }
0611 
0612         alpaka::syncBlockThreads(acc);
0613 
0614         if (sample < 2) {
0615           // will need to subtract for ltx = 3, we double count here
0616           shr_sumAf[elemIdx] += shr_sumAf[elemIdx + 2] + shr_sumAf[elemIdx + 3];
0617           shr_sumff[elemIdx] += shr_sumff[elemIdx + 2] + shr_sumff[elemIdx + 3];
0618         }
0619 
0620         alpaka::syncBlockThreads(acc);
0621 
0622         if (sample == 0) {
0623           // exit if the state is done
0624           // note, we do not exit before all __synchtreads are finished
0625           if (state == TimeComputationState::Finished) {
0626             g_timeMax[ch] = 5;
0627             g_timeError[ch] = -999;
0628             continue;
0629           }
0630 
0631           // subtract to avoid double counting
0632           auto const sumff = shr_sumff[elemIdx] + shr_sumff[elemIdx + 1] - shr_sumff[elemIdx + 3];
0633           auto const sumAf = shr_sumAf[elemIdx] + shr_sumAf[elemIdx + 1] - shr_sumAf[elemIdx + 3];
0634 
0635           auto const ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0;
0636           auto const sumAA = sumAAsNullHypot[ch];
0637           auto const sum0 = sum0sNullHypot[ch];
0638           auto const nullChi2 = chi2sNullHypot[ch];
0639           if (sumff > 0) {
0640             auto const chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
0641             if (chi2AlphaBeta > nullChi2) {
0642               // null hypothesis is better
0643               state = TimeComputationState::Finished;
0644 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0645               printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
0646                      ch,
0647                      chi2AlphaBeta,
0648                      nullChi2,
0649                      sumAA,
0650                      sumAf,
0651                      sumff,
0652                      sum0);
0653 #endif
0654             }
0655 
0656             // store to global
0657             g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
0658           } else {
0659 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0660             printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf);
0661 #endif
0662             state = TimeComputationState::Finished;
0663           }
0664 
0665           // store the state to global and finish calcs
0666           g_state[ch] = state;
0667           if (state == TimeComputationState::Finished) {
0668             // store default values into global
0669             g_timeMax[ch] = 5;
0670             g_timeError[ch] = -999;
0671 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0672             printf("ch = %d finished state\n", ch);
0673 #endif
0674             continue;
0675           }
0676 
0677           auto const ampMaxError = g_ampMaxError[ch];
0678           auto const test_ratio = ampMaxAlphaBeta / ampMaxError;
0679           auto const accTimeMax = g_accTimeMax[ch];
0680           auto const accTimeWgt = g_accTimeWgt[ch];
0681           auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
0682           auto const tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch];
0683           // branch to separate large vs small pulses
0684           // see cpu version for more info
0685           if (test_ratio > 5. && accTimeWgt > 0) {
0686             auto const tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast<ScalarType>(0);
0687             auto const tMaxErrorRatio = accTimeWgt > 0 ? 1. / std::sqrt(accTimeWgt) : static_cast<ScalarType>(0);
0688 
0689             if (test_ratio > 10.) {
0690               g_timeMax[ch] = tMaxRatio;
0691               g_timeError[ch] = tMaxErrorRatio;
0692 
0693 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0694               printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio);
0695 #endif
0696             } else {
0697               auto const timeMax = (tMaxAlphaBeta * (10. - ampMaxAlphaBeta / ampMaxError) +
0698                                     tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.)) /
0699                                    5.;
0700               auto const timeError = (tMaxErrorAlphaBeta * (10. - ampMaxAlphaBeta / ampMaxError) +
0701                                       tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.)) /
0702                                      5.;
0703               state = TimeComputationState::Finished;
0704               g_state[ch] = state;
0705               g_timeMax[ch] = timeMax;
0706               g_timeError[ch] = timeError;
0707 
0708 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0709               printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError);
0710 #endif
0711             }
0712           } else {
0713             state = TimeComputationState::Finished;
0714             g_state[ch] = state;
0715             g_timeMax[ch] = tMaxAlphaBeta;
0716             g_timeError[ch] = tMaxErrorAlphaBeta;
0717 
0718 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0719             printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
0720 #endif
0721           }
0722         }
0723       }
0724     }
0725   };
0726 
0727   class Kernel_time_compute_fixMGPAslew {
0728     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0729 
0730   public:
0731     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0732     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0733                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0734                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0735                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0736                                   ScalarType* sample_values,
0737                                   ScalarType* sample_value_errors,
0738                                   bool* useless_sample_values) const {
0739       // constants
0740       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0741 
0742       auto const nchannelsEB = digisDevEB.size();
0743       auto const offsetForInputs = nchannelsEB;
0744 
0745       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0746 
0747       for (auto gtx : cms::alpakatools::uniform_elements(acc, nchannelsEB * nsamples)) {
0748         auto const elemIdx = gtx % elemsPerBlock;
0749         auto const sample = elemIdx % nsamples;
0750         auto const ch = gtx / nsamples;
0751 
0752         // remove thread for sample 0, oversubscribing is easier than ....
0753         if (sample == 0)
0754           continue;
0755 
0756         if (!use_sample(conditionsDev.sampleMask_EB(), sample))
0757           continue;
0758 
0759         int const inputGtx = ch >= offsetForInputs ? gtx - offsetForInputs * nsamples : gtx;
0760         auto const* digis = ch >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();
0761 
0762         auto const gainIdPrev = ecalMGPA::gainId(digis[inputGtx - 1]);
0763         auto const gainIdNext = ecalMGPA::gainId(digis[inputGtx]);
0764         if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) {
0765           sample_values[gtx - 1] = 0;
0766           sample_value_errors[gtx - 1] = 1e+9;
0767           useless_sample_values[gtx - 1] = true;
0768         }
0769       }
0770     }
0771   };
0772 
0773   //#define ECAL_RECO_ALPAKA_TC_INIT_DEBUG
0774   class Kernel_time_computation_init {
0775     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0776 
0777   public:
0778     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0779     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0780                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0781                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0782                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0783                                   ScalarType* sample_values,
0784                                   ScalarType* sample_value_errors,
0785                                   ScalarType* ampMaxError,
0786                                   bool* useless_sample_values,
0787                                   char* pedestal_nums) const {
0788       // constants
0789       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0790 
0791       // indices
0792       auto const nchannelsEB = digisDevEB.size();
0793       auto const nchannels = nchannelsEB + digisDevEE.size();
0794       auto const offsetForInputs = nchannelsEB;
0795       auto const offsetForHashes = conditionsDev.offsetEE();
0796 
0797       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0798 
0799       // configure shared mem
0800       auto* shrSampleValues = alpaka::getDynSharedMem<ScalarType>(acc);
0801       auto* shrSampleValueErrors = shrSampleValues + elemsPerBlock;
0802 
0803       for (auto txforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
0804         // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
0805         auto tx = nchannels * nsamples - 1 - txforward;
0806         auto const ch = tx / nsamples;
0807         auto const elemIdx = tx % elemsPerBlock;
0808 
0809         int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx;
0810         int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0811         auto const* digis = ch >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();
0812         auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
0813 
0814         // indices/inits
0815         auto const sample = tx % nsamples;
0816         auto const input_ch_start = inputCh * nsamples;
0817         ScalarType pedestal = 0.;
0818         int num = 0;
0819 
0820         // 0 and 1 sample values
0821         auto const adc0 = ecalMGPA::adc(digis[input_ch_start]);
0822         auto const gainId0 = ecalMGPA::gainId(digis[input_ch_start]);
0823         auto const adc1 = ecalMGPA::adc(digis[input_ch_start + 1]);
0824         auto const gainId1 = ecalMGPA::gainId(digis[input_ch_start + 1]);
0825         auto const did = DetId{dids[inputCh]};
0826         auto const isBarrel = did.subdetId() == EcalBarrel;
0827         auto const sample_mask = isBarrel ? conditionsDev.sampleMask_EB() : conditionsDev.sampleMask_EE();
0828         auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0829                                        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0830 
0831         // set pedestal
0832         // TODO this branch is non-divergent for a group of 10 threads
0833         if (gainId0 == 1 && use_sample(sample_mask, 0)) {
0834           pedestal = static_cast<ScalarType>(adc0);
0835           num = 1;
0836 
0837           auto const diff = adc1 - adc0;
0838           if (gainId1 == 1 && use_sample(sample_mask, 1) &&
0839               std::abs(diff) < 3 * conditionsDev.pedestals_rms_x12()[hashedId]) {
0840             pedestal = (pedestal + static_cast<ScalarType>(adc1)) / 2.;
0841             num = 2;
0842           }
0843         } else {
0844           pedestal = conditionsDev.pedestals_mean_x12()[ch];
0845         }
0846 
0847         // ped subtracted and gain-renormalized samples.
0848         auto const gainId = ecalMGPA::gainId(digis[inputTx]);
0849         auto const adc = ecalMGPA::adc(digis[inputTx]);
0850 
0851         bool bad = false;
0852         ScalarType sample_value, sample_value_error;
0853         // TODO divergent branch
0854         // TODO: piece below is general both for amplitudes and timing
0855         // potentially there is a way to reduce the amount of code...
0856         if (!use_sample(sample_mask, sample)) {
0857           bad = true;
0858           sample_value = 0;
0859           sample_value_error = 0;
0860         } else if (gainId == 1) {
0861           sample_value = static_cast<ScalarType>(adc) - pedestal;
0862           sample_value_error = conditionsDev.pedestals_rms_x12()[hashedId];
0863         } else if (gainId == 2) {
0864           auto const mean_x6 = conditionsDev.pedestals_mean_x6()[hashedId];
0865           auto const rms_x6 = conditionsDev.pedestals_rms_x6()[hashedId];
0866           auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
0867           sample_value = (static_cast<ScalarType>(adc) - mean_x6) * gain12Over6;
0868           sample_value_error = rms_x6 * gain12Over6;
0869         } else if (gainId == 3) {
0870           auto const mean_x1 = conditionsDev.pedestals_mean_x1()[hashedId];
0871           auto const rms_x1 = conditionsDev.pedestals_rms_x1()[hashedId];
0872           auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
0873           auto const gain6Over1 = conditionsDev.gain6Over1()[hashedId];
0874           sample_value = (static_cast<ScalarType>(adc) - mean_x1) * gain6Over1 * gain12Over6;
0875           sample_value_error = rms_x1 * gain6Over1 * gain12Over6;
0876         } else {
0877           sample_value = 0;
0878           sample_value_error = 0;
0879           bad = true;
0880         }
0881 
0882         // TODO: make sure we save things correctly when sample is useless
0883         auto const useless_sample = (sample_value_error <= 0) | bad;
0884         useless_sample_values[tx] = useless_sample;
0885         sample_values[tx] = sample_value;
0886         sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error;
0887 
0888         // DEBUG
0889 #ifdef ECAL_RECO_ALPAKA_TC_INIT_DEBUG
0890         if (ch == 0) {
0891           printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
0892                  sample,
0893                  sample_value,
0894                  sample_value_error,
0895                  useless_sample ? '1' : '0');
0896         }
0897 #endif
0898 
0899         // store into the shared mem
0900         shrSampleValues[elemIdx] = sample_value_error > 0 ? sample_value : std::numeric_limits<ScalarType>::min();
0901         shrSampleValueErrors[elemIdx] = sample_value_error;
0902         alpaka::syncBlockThreads(acc);
0903 
0904         // perform the reduction with min
0905         if (sample < 5) {
0906           // note, if equal -> we keep the value with lower sample as for cpu
0907           shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 5]
0908                                               ? shrSampleValueErrors[elemIdx + 5]
0909                                               : shrSampleValueErrors[elemIdx];
0910           shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 5]);
0911         }
0912         alpaka::syncBlockThreads(acc);
0913 
0914         // a bit of an overkill, but easier than to compare across 3 values
0915         if (sample < 3) {
0916           shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 3]
0917                                               ? shrSampleValueErrors[elemIdx + 3]
0918                                               : shrSampleValueErrors[elemIdx];
0919           shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 3]);
0920         }
0921         alpaka::syncBlockThreads(acc);
0922 
0923         if (sample < 2) {
0924           shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 2]
0925                                               ? shrSampleValueErrors[elemIdx + 2]
0926                                               : shrSampleValueErrors[elemIdx];
0927           shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 2]);
0928         }
0929         alpaka::syncBlockThreads(acc);
0930 
0931         if (sample == 0) {
0932           // we only need the max error
0933           auto const maxSampleValueError = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 1]
0934                                                ? shrSampleValueErrors[elemIdx + 1]
0935                                                : shrSampleValueErrors[elemIdx];
0936 
0937           // # pedestal samples used
0938           pedestal_nums[ch] = num;
0939           // this is used downstream
0940           ampMaxError[ch] = maxSampleValueError;
0941 
0942           // DEBUG
0943 #ifdef ECAL_RECO_ALPAKA_TC_INIT_DEBUG
0944           if (ch == 0) {
0945             printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError);
0946           }
0947 #endif
0948         }
0949       }
0950     }
0951   };
0952 
0953   ///
0954   /// launch context parameters: 1 thread per channel
0955   ///
0956   //#define DEBUG_TIME_CORRECTION
  class Kernel_time_correction_and_finalize {
    using ScalarType = ::ecal::multifit::SampleVector::Scalar;

  public:
    // Final step of the time computation: applies the amplitude-dependent time
    // bias correction (piecewise-linear interpolation in the per-subdetector
    // amplitude bins), converts the computed time maximum into the rec-hit
    // jitter and jitter error, and raises the kOutOfTime flag for large,
    // significantly out-of-time pulses. One thread per channel (EB + EE).
    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
    ALPAKA_FN_ACC void operator()(TAcc const& acc,
                                  EcalDigiDeviceCollection::ConstView digisDevEB,
                                  EcalDigiDeviceCollection::ConstView digisDevEE,
                                  EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEB,
                                  EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEE,
                                  EcalMultifitConditionsDevice::ConstView conditionsDev,
                                  ScalarType* const g_timeMax,
                                  ScalarType* const g_timeError,
                                  ConfigurationParameters::type const timeConstantTermEB,
                                  ConfigurationParameters::type const timeConstantTermEE,
                                  ConfigurationParameters::type const timeNconstEB,
                                  ConfigurationParameters::type const timeNconstEE,
                                  ConfigurationParameters::type const amplitudeThresholdEB,
                                  ConfigurationParameters::type const amplitudeThresholdEE,
                                  ConfigurationParameters::type const outOfTimeThreshG12pEB,
                                  ConfigurationParameters::type const outOfTimeThreshG12pEE,
                                  ConfigurationParameters::type const outOfTimeThreshG12mEB,
                                  ConfigurationParameters::type const outOfTimeThreshG12mEE,
                                  ConfigurationParameters::type const outOfTimeThreshG61pEB,
                                  ConfigurationParameters::type const outOfTimeThreshG61pEE,
                                  ConfigurationParameters::type const outOfTimeThreshG61mEB,
                                  ConfigurationParameters::type const outOfTimeThreshG61mEE) const {
      // constants
      constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
      auto const nchannelsEB = digisDevEB.size();
      auto const nchannels = nchannelsEB + digisDevEE.size();
      auto const offsetForInputs = nchannelsEB;
      auto const offsetForHashes = conditionsDev.offsetEE();

      for (auto gtx : cms::alpakatools::uniform_elements(acc, nchannels)) {
        // EE channels follow the EB ones in the global index; translate to the
        // per-subdetector collections
        const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx;
        auto const* dids = gtx >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
        auto const* digis = gtx >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();

        // select the output rec-hit collection matching this channel
        auto* g_amplitude = gtx >= nchannelsEB ? uncalibRecHitsEE.amplitude() : uncalibRecHitsEB.amplitude();
        auto* g_jitter = gtx >= nchannelsEB ? uncalibRecHitsEE.jitter() : uncalibRecHitsEB.jitter();
        auto* g_jitterError = gtx >= nchannelsEB ? uncalibRecHitsEE.jitterError() : uncalibRecHitsEB.jitterError();
        auto* flags = gtx >= nchannelsEB ? uncalibRecHitsEE.flags() : uncalibRecHitsEB.flags();

        auto const did = DetId{dids[inputGtx]};
        auto const isBarrel = did.subdetId() == EcalBarrel;
        // conditions arrays are indexed by the hashed detid (EE offset past EB)
        auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
                                       : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
        // need to access the underlying data directly here because the std::arrays have different size for EB and EE, which is not compatible with the ? operator
        auto* const amplitudeBins = isBarrel ? conditionsDev.timeBiasCorrections_amplitude_EB().data()
                                             : conditionsDev.timeBiasCorrections_amplitude_EE().data();
        auto* const shiftBins = isBarrel ? conditionsDev.timeBiasCorrections_shift_EB().data()
                                         : conditionsDev.timeBiasCorrections_shift_EE().data();
        auto const amplitudeBinsSize =
            isBarrel ? conditionsDev.timeBiasCorrectionSizeEB() : conditionsDev.timeBiasCorrectionSizeEE();
        // pick the EB or EE flavour of every configuration parameter
        auto const timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE;
        auto const timeNconst = isBarrel ? timeNconstEB : timeNconstEE;
        auto const offsetTimeValue = isBarrel ? conditionsDev.timeOffset_EB() : conditionsDev.timeOffset_EE();
        auto const amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE;
        auto const outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE;
        auto const outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE;
        auto const outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE;
        auto const outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE;

        // load some
        auto const amplitude = g_amplitude[inputGtx];
        auto const rms_x12 = conditionsDev.pedestals_rms_x12()[hashedId];
        auto const timeCalibConst = conditionsDev.timeCalibConstants()[hashedId];

        // find the last amplitude bin below this channel's amplitude
        // (bins are assumed sorted in increasing order; -1 means below the first bin)
        int myBin = -1;
        for (size_t bin = 0; bin < amplitudeBinsSize; ++bin) {
          if (amplitude > amplitudeBins[bin])
            myBin = bin;
          else
            break;
        }

        // piecewise-linear interpolation of the time shift in the amplitude bins,
        // clamped to the first/last shift outside the binned range
        ScalarType correction = 0;
        if (myBin == -1) {
          correction = shiftBins[0];
        } else if (myBin == static_cast<int>(amplitudeBinsSize) - 1) {
          correction = shiftBins[myBin];
        } else {
          correction = shiftBins[myBin + 1] - shiftBins[myBin];
          correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]);
          correction += shiftBins[myBin];
        }

        // correction * 1./25.
        // (convert the correction into clock-sample units, presumably 25 ns per sample — see the *25 below)
        correction = correction * 0.04;
        auto const timeMax = g_timeMax[gtx];
        auto const timeError = g_timeError[gtx];
        // jitter is the deviation of the corrected time maximum from the nominal sample 5
        auto const jitter = timeMax - 5 + correction;
        auto const jitterError =
            std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04);  // 0.04 = 1./25.

#ifdef DEBUG_TIME_CORRECTION
        printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
               gtx,
               timeMax,
               timeError,
               jitter,
               correction);
#endif

        // store back to  global
        g_jitter[inputGtx] = jitter;
        g_jitterError[inputGtx] = jitterError;

        // set the flag
        // TODO: replace with something more efficient (if required),
        // for now just to make it work
        if (amplitude > amplitudeThreshold * rms_x12) {
          // default to the gain-12 out-of-time thresholds
          auto threshP = outOfTimeThreshG12p;
          auto threshM = outOfTimeThreshG12m;
          if (amplitude > 3000.) {
            // large pulse: if any sample left gain 12, use the gain-switched thresholds
            for (int isample = 0; isample < nsamples; isample++) {
              auto const gainid = ecalMGPA::gainId(digis[nsamples * inputGtx + isample]);
              if (gainid != 1) {
                threshP = outOfTimeThreshG61p;
                threshM = outOfTimeThreshG61m;
                break;
              }
            }
          }

          // compare the calibrated time against an amplitude-dependent resolution window
          auto const correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue;
          auto const nterm = timeNconst * rms_x12 / amplitude;
          auto const sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm);
          if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM)
            flags[inputGtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
        }
      }
    }
  };
1092 
1093 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit
1094 
1095 namespace alpaka::trait {
1096   using namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit;
1097 
1098   //! The trait for getting the size of the block shared dynamic memory for Kernel_time_compute_nullhypot.
1099   template <typename TAcc>
1100   struct BlockSharedMemDynSizeBytes<Kernel_time_compute_nullhypot, TAcc> {
1101     //! \return The size of the shared memory allocated for a block.
1102     template <typename TVec, typename... TArgs>
1103     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_nullhypot const&,
1104                                                                  TVec const& threadsPerBlock,
1105                                                                  TVec const& elemsPerThread,
1106                                                                  TArgs const&...) -> std::size_t {
1107       using ScalarType = ecal::multifit::SampleVector::Scalar;
1108 
1109       // return the amount of dynamic shared memory needed
1110       std::size_t bytes = threadsPerBlock[0u] * elemsPerThread[0u] * 4 * sizeof(ScalarType);
1111       return bytes;
1112     }
1113   };
1114 
1115   //! The trait for getting the size of the block shared dynamic memory for Kernel_time_compute_makeratio.
1116   template <typename TAcc>
1117   struct BlockSharedMemDynSizeBytes<Kernel_time_compute_makeratio, TAcc> {
1118     template <typename TVec, typename... TArgs>
1119     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_makeratio const&,
1120                                                                  TVec const& threadsPerBlock,
1121                                                                  TVec const& elemsPerThread,
1122                                                                  TArgs const&...) -> std::size_t {
1123       using ScalarType = ecal::multifit::SampleVector::Scalar;
1124 
1125       std::size_t bytes = (8 * sizeof(ScalarType) + 3 * sizeof(bool)) * threadsPerBlock[0u] * elemsPerThread[0u];
1126       return bytes;
1127     }
1128   };
1129 
1130   //! The trait for getting the size of the block shared dynamic memory for Kernel_time_compute_findamplchi2_and_finish.
1131   template <typename TAcc>
1132   struct BlockSharedMemDynSizeBytes<Kernel_time_compute_findamplchi2_and_finish, TAcc> {
1133     template <typename TVec, typename... TArgs>
1134     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_findamplchi2_and_finish const&,
1135                                                                  TVec const& threadsPerBlock,
1136                                                                  TVec const& elemsPerThread,
1137                                                                  TArgs const&...) -> std::size_t {
1138       using ScalarType = ecal::multifit::SampleVector::Scalar;
1139 
1140       std::size_t bytes = 2 * threadsPerBlock[0u] * elemsPerThread[0u] * sizeof(ScalarType);
1141       return bytes;
1142     }
1143   };
1144 
1145   //! The trait for getting the size of the block shared dynamic memory for Kernel_time_computation_init.
1146   template <typename TAcc>
1147   struct BlockSharedMemDynSizeBytes<Kernel_time_computation_init, TAcc> {
1148     template <typename TVec, typename... TArgs>
1149     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_computation_init const&,
1150                                                                  TVec const& threadsPerBlock,
1151                                                                  TVec const& elemsPerThread,
1152                                                                  TArgs const&...) -> std::size_t {
1153       using ScalarType = ecal::multifit::SampleVector::Scalar;
1154 
1155       std::size_t bytes = 2 * threadsPerBlock[0u] * elemsPerThread[0u] * sizeof(ScalarType);
1156       return bytes;
1157     }
1158   };
1159 
1160 }  // namespace alpaka::trait
1161 
1162 #endif  // RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h