0001 #ifndef RecoLocalCalo_EcalRecProducers_plugins_alpaka_TimeComputationKernels_h
0002 #define RecoLocalCalo_EcalRecProducers_plugins_alpaka_TimeComputationKernels_h
0003 
0004 #include <cassert>
0005 #include <cmath>
0006 #include <limits>
0007 
0008 #include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitConditionsDevice.h"
0009 #include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitParametersDevice.h"
0010 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
0011 #include "DataFormats/EcalDigi/interface/EcalMGPASample.h"
0012 #include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
0013 #include "DataFormats/Math/interface/approx_exp.h"
0014 #include "DataFormats/Math/interface/approx_log.h"
0015 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0016 #include "RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h"
0017 
0018 #include "DeclsForKernels.h"
0019 #include "KernelHelpers.h"
0020 
0021 //#define ECAL_RECO_ALPAKA_DEBUG
0022 
0023 namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
0024 
0025   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool use_sample(unsigned int sample_mask, unsigned int sample) {
0026     return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1)));
0027   }
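       // Note: sample 0 corresponds to the most significant of the MAXSAMPLES mask bits.
       // For example, with MAXSAMPLES == 10, sample 0 tests bit 9 and sample 9 tests bit 0,
       // so a mask of 0x3ff marks all ten samples as usable.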
0028 
0029   ALPAKA_FN_ACC constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }
0030   ALPAKA_FN_ACC constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }
0031 
0032   class Kernel_time_compute_nullhypot {
0033     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0034 
0035   public:
0036     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0037     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0038                                   ScalarType* const sample_values,
0039                                   ScalarType* const sample_value_errors,
0040                                   bool* const useless_sample_values,
0041                                   ScalarType* chi2s,
0042                                   ScalarType* sum0s,
0043                                   ScalarType* sumAAs,
0044                                   uint32_t const nchannels) const {
0045       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0046 
0047       // indices
0048       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0049 
0050       // shared mem inits
0051       auto* s_sum0 = alpaka::getDynSharedMem<char>(acc);
0052       auto* s_sum1 = reinterpret_cast<ScalarType*>(s_sum0 + elemsPerBlock);
0053       auto* s_sumA = s_sum1 + elemsPerBlock;
0054       auto* s_sumAA = s_sumA + elemsPerBlock;
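           // dynamic shared memory layout (the size is provided by the BlockSharedMemDynSizeBytes
           // trait at the end of this file): s_sum0 holds per-element sample counts as char,
           // followed by the ScalarType arrays s_sum1, s_sumA and s_sumAA, one entry per element.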
0055 
0056       for (auto txforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
0057         // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
0058         auto tx = nchannels * nsamples - 1 - txforward;
0059         auto const ch = tx / nsamples;
0060 
0061         auto const sample = tx % nsamples;
0062         auto const ltx = tx % elemsPerBlock;
0063 
0064         // TODO make sure no div by 0
0065         auto const inv_error =
0066             useless_sample_values[tx] ? 0. : 1. / (sample_value_errors[tx] * sample_value_errors[tx]);
0067         auto const sample_value = sample_values[tx];
0068         s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1;
0069         s_sum1[ltx] = inv_error;
0070         s_sumA[ltx] = sample_value * inv_error;
0071         s_sumAA[ltx] = sample_value * sample_value * inv_error;
0072         alpaka::syncBlockThreads(acc);
0073 
0074         // 5 threads for [0, 4] samples
0075         if (sample < 5) {
0076           s_sum0[ltx] += s_sum0[ltx + 5];
0077           s_sum1[ltx] += s_sum1[ltx + 5];
0078           s_sumA[ltx] += s_sumA[ltx + 5];
0079           s_sumAA[ltx] += s_sumAA[ltx + 5];
0080         }
0081         alpaka::syncBlockThreads(acc);
0082 
0083         if (sample < 2) {
0084           // note double counting of sample 3
0085           s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3];
0086           s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3];
0087           s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3];
0088           s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3];
0089         }
0090         alpaka::syncBlockThreads(acc);
0091 
0092         if (sample == 0) {
0093           // note, subtract to remove the double counting of sample == 3
0094           auto const sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3];
0095           auto const sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3];
0096           auto const sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3];
0097           auto const sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3];
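               // null hypothesis: fit a constant A to the usable samples, weighted by 1/sigma^2.
               // The best fit is A = sumA / sum1, so chi2 = (sumAA - sumA * sumA / sum1) / sum0,
               // i.e. the weighted residual sum averaged over the sum0 usable samples.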
0098           auto const chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast<ScalarType>(0);
0099           chi2s[ch] = chi2;
0100           sum0s[ch] = sum0;
0101           sumAAs[ch] = sumAA;
0102 
0103 #ifdef DEBUG_TC_NULLHYPOT
0104           if (ch == 0) {
0105             printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast<int>(sum0), sumAA);
0106           }
0107 #endif
0108         }
0109       }
0110     }
0111   };
0112 
0113   //
0114   // launch ctx parameters are
0115   // 45 threads per channel, X channels per block, Y blocks
0116   // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
0117   // TODO: it might be much better to use 32 threads per channel instead of 45
0118   // to simplify the synchronization
0119   class Kernel_time_compute_makeratio {
0120     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0121 
0122   public:
0123     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0124     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0125                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0126                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0127                                   ScalarType* const sample_values,
0128                                   ScalarType* const sample_value_errors,
0129                                   bool* const useless_sample_values,
0130                                   char* const pedestal_nums,
0131                                   ScalarType* const sumAAsNullHypot,
0132                                   ScalarType* const sum0sNullHypot,
0133                                   ScalarType* tMaxAlphaBetas,
0134                                   ScalarType* tMaxErrorAlphaBetas,
0135                                   ScalarType* g_accTimeMax,
0136                                   ScalarType* g_accTimeWgt,
0137                                   TimeComputationState* g_state,
0138                                   EcalMultifitParametersDevice::ConstView paramsDev,
0139                                   ConfigurationParameters::type const timeFitLimits_firstEB,
0140                                   ConfigurationParameters::type const timeFitLimits_firstEE,
0141                                   ConfigurationParameters::type const timeFitLimits_secondEB,
0142                                   ConfigurationParameters::type const timeFitLimits_secondEE) const {
0143       // constants
0144       constexpr uint32_t nchannels_per_block = 10;
0145       constexpr auto nthreads_per_channel = nchannels_per_block * (nchannels_per_block - 1) / 2;
0146       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0147       auto const nchannels = digisDevEB.size() + digisDevEE.size();
0148       auto const offsetForInputs = digisDevEB.size();
0149       auto const totalElements = nthreads_per_channel * nchannels;
0150 
0151       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0152       ALPAKA_ASSERT_ACC(nthreads_per_channel * nchannels_per_block == elemsPerBlock);
0153 
0154       auto* shr_chi2s = alpaka::getDynSharedMem<ScalarType>(acc);
0155       auto* shr_time_wgt = shr_chi2s + elemsPerBlock;
0156       auto* shr_time_max = shr_time_wgt + elemsPerBlock;
0157       auto* shrTimeMax = shr_time_max + elemsPerBlock;
0158       auto* shrTimeWgt = shrTimeMax + elemsPerBlock;
0159       auto* shr_chi2 = shrTimeWgt + elemsPerBlock;
0160       auto* shr_tmax = shr_chi2 + elemsPerBlock;
0161       auto* shr_tmaxerr = shr_tmax + elemsPerBlock;
0162       auto* shr_condForUselessSamples = reinterpret_cast<bool*>(shr_tmaxerr + elemsPerBlock);
0163       auto* shr_internalCondForSkipping1 = shr_condForUselessSamples + elemsPerBlock;
0164       auto* shr_internalCondForSkipping2 = shr_internalCondForSkipping1 + elemsPerBlock;
0165 
0166       for (auto block : cms::alpakatools::uniform_groups(acc, totalElements)) {
0167         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0168           auto const ch = idx.global / nthreads_per_channel;
0169           auto const ltx = idx.global % nthreads_per_channel;
0170 
0171           auto const ch_start = ch * nsamples;
0172           auto const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0173           auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
0174 
0175           auto const did = DetId{dids[inputCh]};
0176           auto const isBarrel = did.subdetId() == EcalBarrel;
0177           auto* const amplitudeFitParameters =
0178               isBarrel ? paramsDev.amplitudeFitParamsEB().data() : paramsDev.amplitudeFitParamsEE().data();
0179           auto* const timeFitParameters =
0180               isBarrel ? paramsDev.timeFitParamsEB().data() : paramsDev.timeFitParamsEE().data();
0181           auto const timeFitParameters_size =
0182               isBarrel ? paramsDev.timeFitParamsEB().size() : paramsDev.timeFitParamsEE().size();
0183           auto const timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE;
0184           auto const timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE;
0185 
0186           // map tx -> (sample_i, sample_j)
0187           int sample_i = 0;
0188           int sample_j = 0;
0189           if (ltx <= 8) {
0190             sample_i = 0;
0191             sample_j = 1 + ltx;
0192           } else if (ltx <= 16) {
0193             sample_i = 1;
0194             sample_j = 2 + ltx - 9;
0195           } else if (ltx <= 23) {
0196             sample_i = 2;
0197             sample_j = 3 + ltx - 17;
0198           } else if (ltx <= 29) {
0199             sample_i = 3;
0200             sample_j = 4 + ltx - 24;
0201           } else if (ltx <= 34) {
0202             sample_i = 4;
0203             sample_j = 5 + ltx - 30;
0204           } else if (ltx <= 38) {
0205             sample_i = 5;
0206             sample_j = 6 + ltx - 35;
0207           } else if (ltx <= 41) {
0208             sample_i = 6;
0209             sample_j = 7 + ltx - 39;
0210           } else if (ltx <= 43) {
0211             sample_i = 7;
0212             sample_j = 8 + ltx - 42;
0213           } else if (ltx <= 44) {
0214             sample_i = 8;
0215             sample_j = 9;
0216           } else {
0217             // FIXME this needs a more portable solution that wraps abort() / __trap() / throw, depending on the back-end
0218             ALPAKA_ASSERT_ACC(false);
0219           }
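               // The mapping above enumerates all 45 ordered sample pairs (sample_i, sample_j) with i < j:
               //   ltx  0..8  -> (0,1)..(0,9)
               //   ltx  9..16 -> (1,2)..(1,9)
               //   ...
               //   ltx 42..43 -> (7,8), (7,9)
               //   ltx 44     -> (8,9)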
0220 
0221           auto const tx_i = ch_start + sample_i;
0222           auto const tx_j = ch_start + sample_j;
0223 
0224           //
0225           // note, given the way we partition the block, with 45 threads per channel
0226           // we will end up with inactive threads which need to be dragged along
0227           // through the synching point
0228           //
0229           bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] ||
0230                                              sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1;
0231 
0232           //
0233           // see cpu implementation for explanation
0234           //
0235           ScalarType chi2 = std::numeric_limits<ScalarType>::max();
0236           ScalarType tmax = 0;
0237           ScalarType tmaxerr = 0;
0238           shrTimeMax[idx.local] = 0;
0239           shrTimeWgt[idx.local] = 0;
0240 
0241           bool internalCondForSkipping1 = true;
0242           bool internalCondForSkipping2 = true;
0243           if (!condForUselessSamples) {
0244             auto const rtmp = sample_values[tx_i] / sample_values[tx_j];
0245             auto const invampl_i = 1. / sample_values[tx_i];
0246             auto const relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i;
0247             auto const invampl_j = 1. / sample_values[tx_j];
0248             auto const relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j;
0249             auto const err1 = rtmp * rtmp * (relErr2_i + relErr2_j);
0250             auto err2 =
0251                 sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j);
0252             // TODO non-divergent branch for a block if each block has 1 channel
0253             // otherwise non-divergent for groups of 45 threads
0254             // at this point, pedestal_nums[ch] can be either 0, 1 or 2
0255             if (pedestal_nums[ch] == 2)
0256               err2 *= err2 * 0.5;
0257             auto const err3 = (0.289 * 0.289) * (invampl_j * invampl_j);
0258             auto const total_error = std::sqrt(err1 + err2 + err3);
0259 
0260             auto const alpha = amplitudeFitParameters[0];
0261             auto const beta = amplitudeFitParameters[1];
0262             auto const alphabeta = alpha * beta;
0263             auto const invalphabeta = 1. / alphabeta;
0264 
0265             // variables instead of a struct
0266             auto const ratio_index = sample_i;
0267             auto const ratio_step = sample_j - sample_i;
0268             auto const ratio_value = rtmp;
0269             auto const ratio_error = total_error;
0270 
0271             auto const rlim_i_j = fast_expf(static_cast<ScalarType>(sample_j - sample_i) / beta) - 0.001;
0272             internalCondForSkipping1 = !(total_error < 1. && rtmp > 0.001 && rtmp < rlim_i_j);
0273             if (!internalCondForSkipping1) {
0274               //
0275               // precompute.
0276               // in cpu version this was done conditionally
0277               // however easier to do it here (precompute) and then just filter out
0278               // if not needed
0279               //
0280               auto const l_timeFitLimits_first = timeFitLimits_first;
0281               auto const l_timeFitLimits_second = timeFitLimits_second;
0282               if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) {
0283                 auto const time_max_i = static_cast<ScalarType>(ratio_index);
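                     // evaluate the time-vs-ratio polynomial u(r) and its derivative du/dr with
                     // Horner's rule; du is used to propagate the ratio error, and each sample pair
                     // contributes the time estimate (time_max_i - u) with weight 1 / error2
                     // (stored below already multiplied by that weight).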
0284                 auto u = timeFitParameters[timeFitParameters_size - 1];
0285                 CMS_UNROLL_LOOP
0286                 for (int k = timeFitParameters_size - 2; k >= 0; --k)
0287                   u = u * ratio_value + timeFitParameters[k];
0288 
0289                 auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]);
0290                 for (int k = timeFitParameters_size - 2; k >= 1; --k)
0291                   du = du * ratio_value + k * timeFitParameters[k];
0292 
0293                 auto const error2 = ratio_error * ratio_error * du * du;
0294                 auto const time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast<ScalarType>(0);
0295                 auto const time_wgt = error2 > 0 ? 1. / error2 : static_cast<ScalarType>(0);
0296 
0297                 // store into shared mem
0298                 // note, this name is essentially identical to the one used
0299                 // below.
0300                 shrTimeMax[idx.local] = error2 > 0 ? time_max : 0;
0301                 shrTimeWgt[idx.local] = error2 > 0 ? time_wgt : 0;
0302               } else {
0303                 shrTimeMax[idx.local] = 0;
0304                 shrTimeWgt[idx.local] = 0;
0305               }
0306 
0307               // continue with ratios
0308               auto const stepOverBeta = static_cast<ScalarType>(ratio_step) / beta;
0309               auto const offset = static_cast<ScalarType>(ratio_index) + alphabeta;
0310               auto const rmin = std::max(ratio_value - ratio_error, 0.001);
0311               auto const rmax =
0312                   std::min(ratio_value + ratio_error, fast_expf(static_cast<ScalarType>(ratio_step) / beta) - 0.001);
0313               auto const time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.);
0314               auto const time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.);
0315 
0316               // set tmax and tmaxerr
0317               tmax = 0.5 * (time1 + time2);
0318               tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
0319 #ifdef DEBUG_TC_MAKERATIO
0320               if (ch == 1 || ch == 0)
0321                 printf(
0322                     "ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = "
0323                     "%f\n",
0324                     ch,
0325                     ltx,
0326                     tmax,
0327                     tmaxerr,
0328                     time1,
0329                     time2,
0330                     offset,
0331                     rmin,
0332                     rmax);
0333 #endif
0334 
0335               ScalarType sumAf = 0;
0336               ScalarType sumff = 0;
0337               const int itmin = std::max(-1, static_cast<int>(std::floor(tmax - alphabeta)));
0338               auto loffset = (static_cast<ScalarType>(itmin) - tmax) * invalphabeta;
0339               // TODO: data dependence
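                   // f is the alpha-beta pulse shape evaluated at sample time 'it' relative to tmax:
                   // f(t) = (1 + (t - tmax) / (alpha * beta))^alpha * exp(-(t - tmax) / beta) when
                   // 1 + (t - tmax) / (alpha * beta) > 0, and 0 otherwise. The best-fit amplitude is
                   // sumAf / sumff, which gives the chi2 expression used below.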
0340               for (int it = itmin + 1; it < nsamples; ++it) {
0341                 loffset += invalphabeta;
0342                 if (useless_sample_values[ch_start + it])
0343                   continue;
0344                 auto const inverr2 = 1. / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]);
0345                 auto const term1 = 1. + loffset;
0346                 auto const f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0;
0347                 sumAf += sample_values[ch_start + it] * (f * inverr2);
0348                 sumff += f * (f * inverr2);
0349               }
0350 
0351               auto const sumAA = sumAAsNullHypot[ch];
0352               auto const sum0 = sum0sNullHypot[ch];
0353               chi2 = sumAA;
0354               // TODO: sum0 must not be 0 below; the check needs to be introduced upfront
0355               if (sumff > 0) {
0356                 chi2 = sumAA - sumAf * (sumAf / sumff);
0357               }
0358               chi2 /= sum0;
0359 
0360 #ifdef DEBUG_TC_MAKERATIO
0361               if (ch == 1 || ch == 0)
0362                 printf(
0363                     "ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = "
0364                     "%f\n",
0365                     ch,
0366                     ltx,
0367                     sumAf,
0368                     sumff,
0369                     sumAA,
0370                     static_cast<int>(sum0),
0371                     tmax,
0372                     tmaxerr,
0373                     chi2);
0374 #endif
0375 
0376               if (chi2 > 0 && tmax > 0 && tmaxerr > 0)
0377                 internalCondForSkipping2 = false;
0378               else
0379                 chi2 = std::numeric_limits<ScalarType>::max();
0380             }
0381           }
0382 
0383           // store into smem
0384           shr_chi2s[idx.local] = chi2;
0385           shr_chi2[idx.local] = chi2;
0386           shr_tmax[idx.local] = tmax;
0387           shr_tmaxerr[idx.local] = tmaxerr;
0388           shr_condForUselessSamples[idx.local] = condForUselessSamples;
0389           shr_internalCondForSkipping1[idx.local] = internalCondForSkipping1;
0390           shr_internalCondForSkipping2[idx.local] = internalCondForSkipping2;
0391         }
0392 
0393         alpaka::syncBlockThreads(acc);
0394 
0395         // find min chi2 - quite crude for now
0396         // TODO validate/check
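             // tree reduction over the 45 elements of each channel: in every pass the upper half
             // is folded onto the lower half; when the current count is odd, the middle element
             // has no partner and simply keeps its value until a later pass.
             // The number of active elements shrinks as 45 -> 23 -> 12 -> 6 -> 3 -> 2 -> 1.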
0397         auto iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
0398         bool oddElements = nthreads_per_channel % 2;
0399         CMS_UNROLL_LOOP
0400         while (iter >= 1) {
0401           for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0402             auto const ltx = idx.global % nthreads_per_channel;
0403 
0404             if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) {
0405               // for an odd number of elements, the last element just keeps its own value
0406               // exception is for ltx == 0 and iter == 1
0407               shr_chi2s[idx.local] = std::min(shr_chi2s[idx.local], shr_chi2s[idx.local + iter]);
0408             }
0409           }
0410           alpaka::syncBlockThreads(acc);
0411 
0412           oddElements = iter % 2;
0413           iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
0414         }
0415 
0416         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0417           auto const ltx = idx.global % nthreads_per_channel;
0418 
0419           // get the precomputed flags for this element from shared memory
0420           auto const condForUselessSamples = shr_condForUselessSamples[idx.local];
0421           auto const internalCondForSkipping1 = shr_internalCondForSkipping1[idx.local];
0422           auto const internalCondForSkipping2 = shr_internalCondForSkipping2[idx.local];
0423           // filter out inactive threads and threads with useless samples
0424           if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) {
0425             // the min chi2 is known; now compute the weighted average of the tmax measurements
0426             // see cpu version for more explanation
0427             auto const chi2 = shr_chi2[idx.local];
0428             auto const chi2min = shr_chi2s[idx.local - ltx];
0429             auto const chi2Limit = chi2min + 1.;
0430             auto const tmaxerr = shr_tmaxerr[idx.local];
0431             auto const inverseSigmaSquared = chi2 < chi2Limit ? 1. / (tmaxerr * tmaxerr) : 0.;
0432 
0433 #ifdef DEBUG_TC_MAKERATIO
0434             if (ch == 1 || ch == 0) {
0435               auto const ch = idx.global / nthreads_per_channel;
0436               printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
0437                      ch,
0438                      ltx,
0439                      chi2min,
0440                      chi2Limit,
0441                      inverseSigmaSquared);
0442             }
0443 #endif
0444 
0445             // store into shared mem and run reduction
0446             // TODO: check if cooperative groups would be better
0447             // TODO: check if shuffle intrinsics are better
0448             auto const tmax = shr_tmax[idx.local];
0449             shr_time_wgt[idx.local] = inverseSigmaSquared;
0450             shr_time_max[idx.local] = tmax * inverseSigmaSquared;
0451           } else {
0452             shr_time_wgt[idx.local] = 0;
0453             shr_time_max[idx.local] = 0;
0454           }
0455         }
0456 
0457         alpaka::syncBlockThreads(acc);
0458 
0459         // reduce to compute time_max and time_wgt
0460         iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
0461         oddElements = nthreads_per_channel % 2;
0462         CMS_UNROLL_LOOP
0463         while (iter >= 1) {
0464           for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0465             auto const ltx = idx.global % nthreads_per_channel;
0466 
0467             if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) {
0468               shr_time_wgt[idx.local] += shr_time_wgt[idx.local + iter];
0469               shr_time_max[idx.local] += shr_time_max[idx.local + iter];
0470               shrTimeMax[idx.local] += shrTimeMax[idx.local + iter];
0471               shrTimeWgt[idx.local] += shrTimeWgt[idx.local + iter];
0472             }
0473           }
0474 
0475           alpaka::syncBlockThreads(acc);
0476           oddElements = iter % 2;
0477           iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
0478         }
0479 
0480         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0481           auto const ltx = idx.global % nthreads_per_channel;
0482 
0483           // load the 0th element from shared memory (it contains the accumulated values)
0484           // compute
0485           // store into global mem
0486           if (ltx == 0) {
0487             auto const ch = idx.global / nthreads_per_channel;
0488             auto const tmp_time_max = shr_time_max[idx.local];
0489             auto const tmp_time_wgt = shr_time_wgt[idx.local];
0490 
0491             // we are done if the number of time ratios is 0
0492             if (tmp_time_wgt == 0 && tmp_time_max == 0) {
0493               g_state[ch] = TimeComputationState::Finished;
0494               continue;
0495             }
0496 
0497             // no div by 0
0498             auto const tMaxAlphaBeta = tmp_time_max / tmp_time_wgt;
0499             auto const tMaxErrorAlphaBeta = 1. / std::sqrt(tmp_time_wgt);
0500 
0501             tMaxAlphaBetas[ch] = tMaxAlphaBeta;
0502             tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta;
0503             g_accTimeMax[ch] = shrTimeMax[idx.local];
0504             g_accTimeWgt[ch] = shrTimeWgt[idx.local];
0505             g_state[ch] = TimeComputationState::NotFinished;
0506 
0507 #ifdef DEBUG_TC_MAKERATIO
0508             printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt);
0509             printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
0510                    ch,
0511                    tMaxAlphaBeta,
0512                    tMaxErrorAlphaBeta,
0513                    shrTimeMax[idx.local],
0514                    shrTimeWgt[idx.local]);
0515 #endif
0516           }
0517         }
0518       }
0519     }
0520   };
0521 
0522   class Kernel_time_compute_findamplchi2_and_finish {
0523     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0524 
0525   public:
0526     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0527     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0528                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0529                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0530                                   ScalarType* const sample_values,
0531                                   ScalarType* const sample_value_errors,
0532                                   bool* const useless_samples,
0533                                   ScalarType* const g_tMaxAlphaBeta,
0534                                   ScalarType* const g_tMaxErrorAlphaBeta,
0535                                   ScalarType* const g_accTimeMax,
0536                                   ScalarType* const g_accTimeWgt,
0537                                   ScalarType* const sumAAsNullHypot,
0538                                   ScalarType* const sum0sNullHypot,
0539                                   ScalarType* const chi2sNullHypot,
0540                                   TimeComputationState* g_state,
0541                                   ScalarType* g_ampMaxAlphaBeta,
0542                                   ScalarType* g_ampMaxError,
0543                                   ScalarType* g_timeMax,
0544                                   ScalarType* g_timeError,
0545                                   EcalMultifitParametersDevice::ConstView paramsDev) const {
0546       /// launch ctx parameters are
0547       /// 10 threads per channel, N channels per block, Y blocks
0548       /// TODO: do we need to keep the state around or can it be removed?
0549       //#define DEBUG_FINDAMPLCHI2_AND_FINISH
0550 
0551       // constants
0552       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0553       auto const nchannels = digisDevEB.size() + digisDevEE.size();
0554       auto const offsetForInputs = digisDevEB.size();
0555 
0556       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0557 
0558       // configure shared mem
0559       // per block, we need #threads per block * 2 * sizeof(ScalarType)
0560       // we run with N channels per block
0561       auto* shr_sumAf = alpaka::getDynSharedMem<ScalarType>(acc);
0562       auto* shr_sumff = shr_sumAf + elemsPerBlock;
0563 
0564       for (auto gtxforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
0565         // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
0566         auto gtx = nchannels * nsamples - 1 - gtxforward;
0567         auto const ch = gtx / nsamples;
0568         auto const elemIdx = gtx % elemsPerBlock;
0569         auto const sample = elemIdx % nsamples;
0570 
0571         auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
0572         auto const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0573 
0574         auto state = g_state[ch];
0575         auto const did = DetId{dids[inputCh]};
0576         auto* const amplitudeFitParameters = did.subdetId() == EcalBarrel ? paramsDev.amplitudeFitParamsEB().data()
0577                                                                           : paramsDev.amplitudeFitParamsEE().data();
0578 
0579         // TODO is that better than storing into global and launching another kernel
0580         // for the first 10 threads
0581         if (state == TimeComputationState::NotFinished) {
0582           auto const alpha = amplitudeFitParameters[0];
0583           auto const beta = amplitudeFitParameters[1];
0584           auto const alphabeta = alpha * beta;
0585           auto const invalphabeta = 1. / alphabeta;
0586           auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
0587           auto const sample_value = sample_values[gtx];
0588           auto const sample_value_error = sample_value_errors[gtx];
0589           auto const inverr2 =
0590               useless_samples[gtx] ? static_cast<ScalarType>(0) : 1. / (sample_value_error * sample_value_error);
0591           auto const offset = (static_cast<ScalarType>(sample) - tMaxAlphaBeta) * invalphabeta;
0592           auto const term1 = 1. + offset;
0593           auto const f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast<ScalarType>(0.);
0594           auto const sumAf = sample_value * (f * inverr2);
0595           auto const sumff = f * (f * inverr2);
0596 
0597           // store into shared mem
0598           shr_sumAf[elemIdx] = sumAf;
0599           shr_sumff[elemIdx] = sumff;
0600         } else {
0601           shr_sumAf[elemIdx] = 0;
0602           shr_sumff[elemIdx] = 0;
0603         }
0604 
0605         alpaka::syncBlockThreads(acc);
0606 
0607         // reduce
0608         // unroll completely here (but hardcoded)
0609         if (sample < 5) {
0610           shr_sumAf[elemIdx] += shr_sumAf[elemIdx + 5];
0611           shr_sumff[elemIdx] += shr_sumff[elemIdx + 5];
0612         }
0613 
0614         alpaka::syncBlockThreads(acc);
0615 
0616         if (sample < 2) {
0617           // note the double counting of sample 3 here; it will be subtracted below
0618           shr_sumAf[elemIdx] += shr_sumAf[elemIdx + 2] + shr_sumAf[elemIdx + 3];
0619           shr_sumff[elemIdx] += shr_sumff[elemIdx + 2] + shr_sumff[elemIdx + 3];
0620         }
0621 
0622         alpaka::syncBlockThreads(acc);
0623 
0624         if (sample == 0) {
0625           // exit if the state is done
0626           // note, we do not exit before all syncBlockThreads calls are finished
0627           if (state == TimeComputationState::Finished) {
0628             g_timeMax[ch] = 5;
0629             g_timeError[ch] = -999;
0630             continue;
0631           }
0632 
0633           // subtract to avoid double counting
0634           auto const sumff = shr_sumff[elemIdx] + shr_sumff[elemIdx + 1] - shr_sumff[elemIdx + 3];
0635           auto const sumAf = shr_sumAf[elemIdx] + shr_sumAf[elemIdx + 1] - shr_sumAf[elemIdx + 3];
0636 
0637           auto const ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0;
0638           auto const sumAA = sumAAsNullHypot[ch];
0639           auto const sum0 = sum0sNullHypot[ch];
0640           auto const nullChi2 = chi2sNullHypot[ch];
0641           if (sumff > 0) {
0642             auto const chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
0643             if (chi2AlphaBeta > nullChi2) {
0644               // null hypothesis is better
0645               state = TimeComputationState::Finished;
0646 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0647               printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
0648                      ch,
0649                      chi2AlphaBeta,
0650                      nullChi2,
0651                      sumAA,
0652                      sumAf,
0653                      sumff,
0654                      sum0);
0655 #endif
0656             }
0657 
0658             // store to global
0659             g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
0660           } else {
0661 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0662             printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf);
0663 #endif
0664             state = TimeComputationState::Finished;
0665           }
0666 
0667           // store the state to global and finish calcs
0668           g_state[ch] = state;
0669           if (state == TimeComputationState::Finished) {
0670             // store default values into global
0671             g_timeMax[ch] = 5;
0672             g_timeError[ch] = -999;
0673 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0674             printf("ch = %d finished state\n", ch);
0675 #endif
0676             continue;
0677           }
0678 
0679           auto const ampMaxError = g_ampMaxError[ch];
0680           auto const test_ratio = ampMaxAlphaBeta / ampMaxError;
0681           auto const accTimeMax = g_accTimeMax[ch];
0682           auto const accTimeWgt = g_accTimeWgt[ch];
0683           auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
0684           auto const tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch];
0685           // branch to separate large vs small pulses
0686           // see cpu version for more info
0687           if (test_ratio > 5. && accTimeWgt > 0) {
0688             auto const tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast<ScalarType>(0);
0689             auto const tMaxErrorRatio = accTimeWgt > 0 ? 1. / std::sqrt(accTimeWgt) : static_cast<ScalarType>(0);
0690 
0691             if (test_ratio > 10.) {
0692               g_timeMax[ch] = tMaxRatio;
0693               g_timeError[ch] = tMaxErrorRatio;
0694 
0695 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0696               printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio);
0697 #endif
0698             } else {
0699               auto const timeMax = (tMaxAlphaBeta * (10. - ampMaxAlphaBeta / ampMaxError) +
0700                                     tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.)) /
0701                                    5.;
0702               auto const timeError = (tMaxErrorAlphaBeta * (10. - ampMaxAlphaBeta / ampMaxError) +
0703                                       tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.)) /
0704                                      5.;
0705               state = TimeComputationState::Finished;
0706               g_state[ch] = state;
0707               g_timeMax[ch] = timeMax;
0708               g_timeError[ch] = timeError;
0709 
0710 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0711               printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError);
0712 #endif
0713             }
0714           } else {
0715             state = TimeComputationState::Finished;
0716             g_state[ch] = state;
0717             g_timeMax[ch] = tMaxAlphaBeta;
0718             g_timeError[ch] = tMaxErrorAlphaBeta;
0719 
0720 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0721             printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
0722 #endif
0723           }
0724         }
0725       }
0726     }
0727   };
0728 
0729   class Kernel_time_compute_fixMGPAslew {
0730     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0731 
0732   public:
0733     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0734     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0735                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0736                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0737                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0738                                   ScalarType* sample_values,
0739                                   ScalarType* sample_value_errors,
0740                                   bool* useless_sample_values) const {
0741       // constants
0742       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0743 
0744       auto const nchannelsEB = digisDevEB.size();
0745       auto const offsetForInputs = nchannelsEB;
0746 
0747       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0748 
0749       for (auto gtx : cms::alpakatools::uniform_elements(acc, nchannelsEB * nsamples)) {
0750         auto const elemIdx = gtx % elemsPerBlock;
0751         auto const sample = elemIdx % nsamples;
0752         auto const ch = gtx / nsamples;
0753 
0754         // remove thread for sample 0, oversubscribing is easier than ....
0755         if (sample == 0)
0756           continue;
0757 
0758         if (!use_sample(conditionsDev.sampleMask_EB(), sample))
0759           continue;
0760 
0761         int const inputGtx = ch >= offsetForInputs ? gtx - offsetForInputs * nsamples : gtx;
0762         auto const* digis = ch >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();
0763 
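             // MGPA gain-switch slew fix: if the gain id increases between two consecutive samples
             // (i.e. the amplification is switched down), the sample just before the switch is
             // considered distorted, so it is zeroed and flagged as useless.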
0764         auto const gainIdPrev = ecalMGPA::gainId(digis[inputGtx - 1]);
0765         auto const gainIdNext = ecalMGPA::gainId(digis[inputGtx]);
0766         if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) {
0767           sample_values[gtx - 1] = 0;
0768           sample_value_errors[gtx - 1] = 1e+9;
0769           useless_sample_values[gtx - 1] = true;
0770         }
0771       }
0772     }
0773   };
0774 
0775   //#define ECAL_RECO_ALPAKA_TC_INIT_DEBUG
0776   class Kernel_time_computation_init {
0777     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0778 
0779   public:
0780     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0781     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0782                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0783                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0784                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0785                                   ScalarType* sample_values,
0786                                   ScalarType* sample_value_errors,
0787                                   ScalarType* ampMaxError,
0788                                   bool* useless_sample_values,
0789                                   char* pedestal_nums) const {
0790       // constants
0791       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0792 
0793       // indices
0794       auto const nchannelsEB = digisDevEB.size();
0795       auto const nchannels = nchannelsEB + digisDevEE.size();
0796       auto const offsetForInputs = nchannelsEB;
0797       auto const offsetForHashes = conditionsDev.offsetEE();
0798 
0799       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0800 
0801       // configure shared mem
0802       auto* shrSampleValues = alpaka::getDynSharedMem<ScalarType>(acc);
0803       auto* shrSampleValueErrors = shrSampleValues + elemsPerBlock;
0804 
0805       for (auto txforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
0806         // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
0807         auto tx = nchannels * nsamples - 1 - txforward;
0808         auto const ch = tx / nsamples;
0809         auto const elemIdx = tx % elemsPerBlock;
0810 
0811         int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx;
0812         int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0813         auto const* digis = ch >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();
0814         auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
0815 
0816         // indices/inits
0817         auto const sample = tx % nsamples;
0818         auto const input_ch_start = inputCh * nsamples;
0819         ScalarType pedestal = 0.;
0820         int num = 0;
0821 
0822         // 0 and 1 sample values
0823         auto const adc0 = ecalMGPA::adc(digis[input_ch_start]);
0824         auto const gainId0 = ecalMGPA::gainId(digis[input_ch_start]);
0825         auto const adc1 = ecalMGPA::adc(digis[input_ch_start + 1]);
0826         auto const gainId1 = ecalMGPA::gainId(digis[input_ch_start + 1]);
0827         auto const did = DetId{dids[inputCh]};
0828         auto const isBarrel = did.subdetId() == EcalBarrel;
0829         auto const sample_mask = isBarrel ? conditionsDev.sampleMask_EB() : conditionsDev.sampleMask_EE();
0830         auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0831                                        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0832 
0833         // set pedestal
0834         // TODO this branch is non-divergent for a group of 10 threads
0835         if (gainId0 == 1 && use_sample(sample_mask, 0)) {
0836           pedestal = static_cast<ScalarType>(adc0);
0837           num = 1;
0838 
0839           auto const diff = adc1 - adc0;
0840           if (gainId1 == 1 && use_sample(sample_mask, 1) &&
0841               std::abs(diff) < 3 * conditionsDev.pedestals_rms_x12()[hashedId]) {
0842             pedestal = (pedestal + static_cast<ScalarType>(adc1)) / 2.;
0843             num = 2;
0844           }
0845         } else {
0846           pedestal = conditionsDev.pedestals_mean_x12()[ch];
0847         }
0848 
0849         // ped subtracted and gain-renormalized samples.
0850         auto const gainId = ecalMGPA::gainId(digis[inputTx]);
0851         auto const adc = ecalMGPA::adc(digis[inputTx]);
0852 
0853         bool bad = false;
0854         ScalarType sample_value, sample_value_error;
0855         // TODO divergent branch
0856         // TODO: the piece below is common to both the amplitude and timing computations
0857         // potentially there is a way to reduce the amount of code...
0858         if (!use_sample(sample_mask, sample)) {
0859           bad = true;
0860           sample_value = 0;
0861           sample_value_error = 0;
0862         } else if (gainId == 1) {
0863           sample_value = static_cast<ScalarType>(adc) - pedestal;
0864           sample_value_error = conditionsDev.pedestals_rms_x12()[hashedId];
0865         } else if (gainId == 2) {
0866           auto const mean_x6 = conditionsDev.pedestals_mean_x6()[hashedId];
0867           auto const rms_x6 = conditionsDev.pedestals_rms_x6()[hashedId];
0868           auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
0869           sample_value = (static_cast<ScalarType>(adc) - mean_x6) * gain12Over6;
0870           sample_value_error = rms_x6 * gain12Over6;
0871         } else if (gainId == 3) {
0872           auto const mean_x1 = conditionsDev.pedestals_mean_x1()[hashedId];
0873           auto const rms_x1 = conditionsDev.pedestals_rms_x1()[hashedId];
0874           auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
0875           auto const gain6Over1 = conditionsDev.gain6Over1()[hashedId];
0876           sample_value = (static_cast<ScalarType>(adc) - mean_x1) * gain6Over1 * gain12Over6;
0877           sample_value_error = rms_x1 * gain6Over1 * gain12Over6;
0878         } else {
0879           sample_value = 0;
0880           sample_value_error = 0;
0881           bad = true;
0882         }
0883 
0884         // TODO: make sure we save things correctly when sample is useless
0885         auto const useless_sample = (sample_value_error <= 0) | bad;
0886         useless_sample_values[tx] = useless_sample;
0887         sample_values[tx] = sample_value;
0888         sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error;
0889 
0890         // DEBUG
0891 #ifdef ECAL_RECO_ALPAKA_TC_INIT_DEBUG
0892         if (ch == 0) {
0893           printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
0894                  sample,
0895                  sample_value,
0896                  sample_value_error,
0897                  useless_sample ? '1' : '0');
0898         }
0899 #endif
0900 
0901         // store into the shared mem
0902         shrSampleValues[elemIdx] = sample_value_error > 0 ? sample_value : std::numeric_limits<ScalarType>::min();
0903         shrSampleValueErrors[elemIdx] = sample_value_error;
0904         alpaka::syncBlockThreads(acc);
0905 
0906         // perform the reduction to find the max sample value and the error of that sample
0907         if (sample < 5) {
0908           // note, if values are equal we keep the one at the lower sample index, as in the cpu version
0909           shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 5]
0910                                               ? shrSampleValueErrors[elemIdx + 5]
0911                                               : shrSampleValueErrors[elemIdx];
0912           shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 5]);
0913         }
0914         alpaka::syncBlockThreads(acc);
0915 
0916         // a bit of an overkill, but easier than comparing across 3 values
0917         if (sample < 3) {
0918           shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 3]
0919                                               ? shrSampleValueErrors[elemIdx + 3]
0920                                               : shrSampleValueErrors[elemIdx];
0921           shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 3]);
0922         }
0923         alpaka::syncBlockThreads(acc);
0924 
0925         if (sample < 2) {
0926           shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 2]
0927                                               ? shrSampleValueErrors[elemIdx + 2]
0928                                               : shrSampleValueErrors[elemIdx];
0929           shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 2]);
0930         }
0931         alpaka::syncBlockThreads(acc);
0932 
0933         if (sample == 0) {
0934           // we only need the error of the max sample value
0935           auto const maxSampleValueError = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 1]
0936                                                ? shrSampleValueErrors[elemIdx + 1]
0937                                                : shrSampleValueErrors[elemIdx];
0938 
0939           // # pedestal samples used
0940           pedestal_nums[ch] = num;
0941           // this is used downstream
0942           ampMaxError[ch] = maxSampleValueError;
0943 
0944           // DEBUG
0945 #ifdef ECAL_RECO_ALPAKA_TC_INIT_DEBUG
0946           if (ch == 0) {
0947             printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError);
0948           }
0949 #endif
0950         }
0951       }
0952     }
0953   };
0954 
0955   ///
0956   /// launch context parameters: 1 thread per channel
0957   ///
0958   //#define DEBUG_TIME_CORRECTION
0959   class Kernel_time_correction_and_finalize {
0960     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0961 
0962   public:
0963     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
0964     ALPAKA_FN_ACC void operator()(TAcc const& acc,
0965                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0966                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0967                                   EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEB,
0968                                   EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEE,
0969                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0970                                   ScalarType* const g_timeMax,
0971                                   ScalarType* const g_timeError,
0972                                   ConfigurationParameters::type const timeConstantTermEB,
0973                                   ConfigurationParameters::type const timeConstantTermEE,
0974                                   ConfigurationParameters::type const timeNconstEB,
0975                                   ConfigurationParameters::type const timeNconstEE,
0976                                   ConfigurationParameters::type const amplitudeThresholdEB,
0977                                   ConfigurationParameters::type const amplitudeThresholdEE,
0978                                   ConfigurationParameters::type const outOfTimeThreshG12pEB,
0979                                   ConfigurationParameters::type const outOfTimeThreshG12pEE,
0980                                   ConfigurationParameters::type const outOfTimeThreshG12mEB,
0981                                   ConfigurationParameters::type const outOfTimeThreshG12mEE,
0982                                   ConfigurationParameters::type const outOfTimeThreshG61pEB,
0983                                   ConfigurationParameters::type const outOfTimeThreshG61pEE,
0984                                   ConfigurationParameters::type const outOfTimeThreshG61mEB,
0985                                   ConfigurationParameters::type const outOfTimeThreshG61mEE) const {
0986       // constants
0987       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0988       auto const nchannelsEB = digisDevEB.size();
0989       auto const nchannels = nchannelsEB + digisDevEE.size();
0990       auto const offsetForInputs = nchannelsEB;
0991       auto const offsetForHashes = conditionsDev.offsetEE();
0992 
0993       for (auto gtx : cms::alpakatools::uniform_elements(acc, nchannels)) {
0994         const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx;
0995         auto const* dids = gtx >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
0996         auto const* digis = gtx >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();
0997 
0998         auto* g_amplitude = gtx >= nchannelsEB ? uncalibRecHitsEE.amplitude() : uncalibRecHitsEB.amplitude();
0999         auto* g_jitter = gtx >= nchannelsEB ? uncalibRecHitsEE.jitter() : uncalibRecHitsEB.jitter();
1000         auto* g_jitterError = gtx >= nchannelsEB ? uncalibRecHitsEE.jitterError() : uncalibRecHitsEB.jitterError();
1001         auto* flags = gtx >= nchannelsEB ? uncalibRecHitsEE.flags() : uncalibRecHitsEB.flags();
1002 
1003         auto const did = DetId{dids[inputGtx]};
1004         auto const isBarrel = did.subdetId() == EcalBarrel;
1005         auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
1006                                        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
1007         // need to access the underlying data directly here because the std::arrays have different sizes for EB and EE, which is not compatible with the ? operator
1008         auto* const amplitudeBins = isBarrel ? conditionsDev.timeBiasCorrections_amplitude_EB().data()
1009                                              : conditionsDev.timeBiasCorrections_amplitude_EE().data();
1010         auto* const shiftBins = isBarrel ? conditionsDev.timeBiasCorrections_shift_EB().data()
1011                                          : conditionsDev.timeBiasCorrections_shift_EE().data();
1012         auto const amplitudeBinsSize =
1013             isBarrel ? conditionsDev.timeBiasCorrectionSizeEB() : conditionsDev.timeBiasCorrectionSizeEE();
1014         auto const timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE;
1015         auto const timeNconst = isBarrel ? timeNconstEB : timeNconstEE;
1016         auto const offsetTimeValue = isBarrel ? conditionsDev.timeOffset_EB() : conditionsDev.timeOffset_EE();
1017         auto const amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE;
1018         auto const outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE;
1019         auto const outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE;
1020         auto const outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE;
1021         auto const outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE;
1022 
1023         // load some per-channel inputs and conditions
1024         auto const amplitude = g_amplitude[inputGtx];
1025         auto const rms_x12 = conditionsDev.pedestals_rms_x12()[hashedId];
1026         auto const timeCalibConst = conditionsDev.timeCalibConstants()[hashedId];
1027 
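             // amplitude-dependent time bias correction: find the last amplitude bin whose lower
             // edge is below this amplitude, then interpolate the shift linearly between bins;
             // below the first bin the first shift is used, in the last bin the last shift is used.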
1028         int myBin = -1;
1029         for (size_t bin = 0; bin < amplitudeBinsSize; ++bin) {
1030           if (amplitude > amplitudeBins[bin])
1031             myBin = bin;
1032           else
1033             break;
1034         }
1035 
1036         ScalarType correction = 0;
1037         if (myBin == -1) {
1038           correction = shiftBins[0];
1039         } else if (myBin == static_cast<int>(amplitudeBinsSize) - 1) {
1040           correction = shiftBins[myBin];
1041         } else {
1042           correction = shiftBins[myBin + 1] - shiftBins[myBin];
1043           correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]);
1044           correction += shiftBins[myBin];
1045         }
1046 
1047         // correction * 1./25.
1048         correction = correction * 0.04;
1049         auto const timeMax = g_timeMax[gtx];
1050         auto const timeError = g_timeError[gtx];
1051         auto const jitter = timeMax - 5 + correction;
1052         auto const jitterError =
1053             std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04);  // 0.04 = 1./25.
1054 
1055 #ifdef DEBUG_TIME_CORRECTION
1056         printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
1057                gtx,
1058                timeMax,
1059                timeError,
1060                jitter,
1061                correction);
1062 #endif
1063 
1064         // store back to global
1065         g_jitter[inputGtx] = jitter;
1066         g_jitterError[inputGtx] = jitterError;
1067 
1068         // set the flag
1069         // TODO: replace with something more efficient (if required),
1070         // for now just to make it work
1071         if (amplitude > amplitudeThreshold * rms_x12) {
1072           auto threshP = outOfTimeThreshG12p;
1073           auto threshM = outOfTimeThreshG12m;
1074           if (amplitude > 3000.) {
1075             for (int isample = 0; isample < nsamples; isample++) {
1076               auto const gainid = ecalMGPA::gainId(digis[nsamples * inputGtx + isample]);
1077               if (gainid != 1) {
1078                 threshP = outOfTimeThreshG61p;
1079                 threshM = outOfTimeThreshG61m;
1080                 break;
1081               }
1082             }
1083           }
1084 
1085           auto const correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue;
1086           auto const nterm = timeNconst * rms_x12 / amplitude;
1087           auto const sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm);
1088           if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM)
1089             flags[inputGtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
1090         }
1091       }
1092     }
1093   };
1094 
1095 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit
1096 
1097 namespace alpaka::trait {
1098   using namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit;
1099 
1100   //! The trait for getting the size of the block shared dynamic memory for Kernel_time_compute_nullhypot.
1101   template <typename TAcc>
1102   struct BlockSharedMemDynSizeBytes<Kernel_time_compute_nullhypot, TAcc> {
1103     //! \return The size of the shared memory allocated for a block.
1104     template <typename TVec, typename... TArgs>
1105     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_nullhypot const&,
1106                                                                  TVec const& threadsPerBlock,
1107                                                                  TVec const& elemsPerThread,
1108                                                                  TArgs const&...) -> std::size_t {
1109       using ScalarType = ecal::multifit::SampleVector::Scalar;
1110 
1111       // return the amount of dynamic shared memory needed
1112       std::size_t bytes = threadsPerBlock[0u] * elemsPerThread[0u] * 4 * sizeof(ScalarType);
1113       return bytes;
1114     }
1115   };
1116 
1117   //! The trait for getting the size of the block shared dynamic memory for Kernel_time_compute_makeratio.
1118   template <typename TAcc>
1119   struct BlockSharedMemDynSizeBytes<Kernel_time_compute_makeratio, TAcc> {
1120     template <typename TVec, typename... TArgs>
1121     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_makeratio const&,
1122                                                                  TVec const& threadsPerBlock,
1123                                                                  TVec const& elemsPerThread,
1124                                                                  TArgs const&...) -> std::size_t {
1125       using ScalarType = ecal::multifit::SampleVector::Scalar;
1126 
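           // 8 ScalarType arrays (shr_chi2s, shr_time_wgt, shr_time_max, shrTimeMax, shrTimeWgt,
           // shr_chi2, shr_tmax, shr_tmaxerr) and 3 bool arrays (shr_condForUselessSamples,
           // shr_internalCondForSkipping1, shr_internalCondForSkipping2), one entry per element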
1127       std::size_t bytes = (8 * sizeof(ScalarType) + 3 * sizeof(bool)) * threadsPerBlock[0u] * elemsPerThread[0u];
1128       return bytes;
1129     }
1130   };
1131 
1132   //! The trait for getting the size of the block shared dynamic memory for Kernel_time_compute_findamplchi2_and_finish.
1133   template <typename TAcc>
1134   struct BlockSharedMemDynSizeBytes<Kernel_time_compute_findamplchi2_and_finish, TAcc> {
1135     template <typename TVec, typename... TArgs>
1136     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_findamplchi2_and_finish const&,
1137                                                                  TVec const& threadsPerBlock,
1138                                                                  TVec const& elemsPerThread,
1139                                                                  TArgs const&...) -> std::size_t {
1140       using ScalarType = ecal::multifit::SampleVector::Scalar;
1141 
1142       std::size_t bytes = 2 * threadsPerBlock[0u] * elemsPerThread[0u] * sizeof(ScalarType);
1143       return bytes;
1144     }
1145   };
1146 
1147   //! The trait for getting the size of the block shared dynamic memory for Kernel_time_computation_init.
1148   template <typename TAcc>
1149   struct BlockSharedMemDynSizeBytes<Kernel_time_computation_init, TAcc> {
1150     template <typename TVec, typename... TArgs>
1151     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_computation_init const&,
1152                                                                  TVec const& threadsPerBlock,
1153                                                                  TVec const& elemsPerThread,
1154                                                                  TArgs const&...) -> std::size_t {
1155       using ScalarType = ecal::multifit::SampleVector::Scalar;
1156 
1157       std::size_t bytes = 2 * threadsPerBlock[0u] * elemsPerThread[0u] * sizeof(ScalarType);
1158       return bytes;
1159     }
1160   };
1161 
1162 }  // namespace alpaka::trait
1163 
1164 #endif  // RecoLocalCalo_EcalRecProducers_plugins_alpaka_TimeComputationKernels_h