Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2025-01-22 07:34:22

0001 #ifndef RecoLocalCalo_EcalRecProducers_plugins_alpaka_TimeComputationKernels_h
0002 #define RecoLocalCalo_EcalRecProducers_plugins_alpaka_TimeComputationKernels_h
0003 
0004 #include <cassert>
0005 #include <cmath>
0006 #include <limits>
0007 
0008 #include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitConditionsDevice.h"
0009 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
0010 #include "DataFormats/EcalDigi/interface/EcalMGPASample.h"
0011 #include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
0012 #include "DataFormats/Math/interface/approx_exp.h"
0013 #include "DataFormats/Math/interface/approx_log.h"
0014 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
0015 #include "RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h"
0016 
0017 #include "DeclsForKernels.h"
0018 #include "KernelHelpers.h"
0019 #include "EcalMultifitParameters.h"
0020 
0021 //#define ECAL_RECO_ALPAKA_DEBUG
0022 
0023 namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
0024 
0025   ALPAKA_FN_ACC ALPAKA_FN_INLINE bool use_sample(unsigned int sample_mask, unsigned int sample) {
0026     return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1)));
0027   }
0028 
0029   ALPAKA_FN_ACC constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }
0030   ALPAKA_FN_ACC constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }
0031 
0032   class Kernel_time_compute_nullhypot {
0033     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0034 
0035   public:
0036     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0037                                   ScalarType* const sample_values,
0038                                   ScalarType* const sample_value_errors,
0039                                   bool* const useless_sample_values,
0040                                   ScalarType* chi2s,
0041                                   ScalarType* sum0s,
0042                                   ScalarType* sumAAs,
0043                                   uint32_t const nchannels) const {
0044       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0045 
0046       // indices
0047       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0048 
0049       // shared mem inits
0050       auto* s_sum0 = alpaka::getDynSharedMem<char>(acc);
0051       auto* s_sum1 = reinterpret_cast<ScalarType*>(s_sum0 + elemsPerBlock);
0052       auto* s_sumA = s_sum1 + elemsPerBlock;
0053       auto* s_sumAA = s_sumA + elemsPerBlock;
0054 
0055       for (auto txforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
0056         // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
0057         auto tx = nchannels * nsamples - 1 - txforward;
0058         auto const ch = tx / nsamples;
0059 
0060         auto const sample = tx % nsamples;
0061         auto const ltx = tx % elemsPerBlock;
0062 
0063         // TODO make sure no div by 0
0064         auto const inv_error =
0065             useless_sample_values[tx] ? 0. : 1. / (sample_value_errors[tx] * sample_value_errors[tx]);
0066         auto const sample_value = sample_values[tx];
0067         s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1;
0068         s_sum1[ltx] = inv_error;
0069         s_sumA[ltx] = sample_value * inv_error;
0070         s_sumAA[ltx] = sample_value * sample_value * inv_error;
0071         alpaka::syncBlockThreads(acc);
0072 
0073         // 5 threads for [0, 4] samples
0074         if (sample < 5) {
0075           s_sum0[ltx] += s_sum0[ltx + 5];
0076           s_sum1[ltx] += s_sum1[ltx + 5];
0077           s_sumA[ltx] += s_sumA[ltx + 5];
0078           s_sumAA[ltx] += s_sumAA[ltx + 5];
0079         }
0080         alpaka::syncBlockThreads(acc);
0081 
0082         if (sample < 2) {
0083           // note double counting of sample 3
0084           s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3];
0085           s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3];
0086           s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3];
0087           s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3];
0088         }
0089         alpaka::syncBlockThreads(acc);
0090 
0091         if (sample == 0) {
0092           // note, subtract to remove the double counting of sample == 3
0093           auto const sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3];
0094           auto const sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3];
0095           auto const sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3];
0096           auto const sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3];
0097           auto const chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast<ScalarType>(0);
0098           chi2s[ch] = chi2;
0099           sum0s[ch] = sum0;
0100           sumAAs[ch] = sumAA;
0101 
0102 #ifdef DEBUG_TC_NULLHYPOT
0103           if (ch == 0) {
0104             printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast<int>(sum0), sumAA);
0105           }
0106 #endif
0107         }
0108       }
0109     }
0110   };
0111 
0112   //
0113   // launch ctx parameters are
0114   // 45 threads per channel, X channels per block, Y blocks
0115   // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
0116   // TODO: it might be much beter to use 32 threads per channel instead of 45
0117   // to simplify the synchronization
0118   class Kernel_time_compute_makeratio {
0119     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0120 
0121   public:
0122     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0123                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0124                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0125                                   ScalarType* const sample_values,
0126                                   ScalarType* const sample_value_errors,
0127                                   bool* const useless_sample_values,
0128                                   char* const pedestal_nums,
0129                                   ScalarType* const sumAAsNullHypot,
0130                                   ScalarType* const sum0sNullHypot,
0131                                   ScalarType* tMaxAlphaBetas,
0132                                   ScalarType* tMaxErrorAlphaBetas,
0133                                   ScalarType* g_accTimeMax,
0134                                   ScalarType* g_accTimeWgt,
0135                                   TimeComputationState* g_state,
0136                                   EcalMultifitParameters const* paramsDev,
0137                                   ConfigurationParameters::type const timeFitLimits_firstEB,
0138                                   ConfigurationParameters::type const timeFitLimits_firstEE,
0139                                   ConfigurationParameters::type const timeFitLimits_secondEB,
0140                                   ConfigurationParameters::type const timeFitLimits_secondEE) const {
0141       // constants
0142       constexpr uint32_t nchannels_per_block = 10;
0143       constexpr auto nthreads_per_channel = nchannels_per_block * (nchannels_per_block - 1) / 2;
0144       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0145       auto const nchannels = digisDevEB.size() + digisDevEE.size();
0146       auto const offsetForInputs = digisDevEB.size();
0147       auto const totalElements = nthreads_per_channel * nchannels;
0148 
0149       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0150       ALPAKA_ASSERT_ACC(nthreads_per_channel * nchannels_per_block == elemsPerBlock);
0151 
0152       auto* shr_chi2s = alpaka::getDynSharedMem<ScalarType>(acc);
0153       auto* shr_time_wgt = shr_chi2s + elemsPerBlock;
0154       auto* shr_time_max = shr_time_wgt + elemsPerBlock;
0155       auto* shrTimeMax = shr_time_max + elemsPerBlock;
0156       auto* shrTimeWgt = shrTimeMax + elemsPerBlock;
0157       auto* shr_chi2 = shrTimeWgt + elemsPerBlock;
0158       auto* shr_tmax = shr_chi2 + elemsPerBlock;
0159       auto* shr_tmaxerr = shr_tmax + elemsPerBlock;
0160       auto* shr_condForUselessSamples = reinterpret_cast<bool*>(shr_tmaxerr + elemsPerBlock);
0161       auto* shr_internalCondForSkipping1 = shr_condForUselessSamples + elemsPerBlock;
0162       auto* shr_internalCondForSkipping2 = shr_internalCondForSkipping1 + elemsPerBlock;
0163 
0164       for (auto block : cms::alpakatools::uniform_groups(acc, totalElements)) {
0165         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0166           auto const ch = idx.global / nthreads_per_channel;
0167           auto const ltx = idx.global % nthreads_per_channel;
0168 
0169           auto const ch_start = ch * nsamples;
0170           auto const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0171           auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
0172 
0173           auto const did = DetId{dids[inputCh]};
0174           auto const isBarrel = did.subdetId() == EcalBarrel;
0175           auto* const amplitudeFitParameters =
0176               isBarrel ? paramsDev->amplitudeFitParamsEB.data() : paramsDev->amplitudeFitParamsEE.data();
0177           auto* const timeFitParameters =
0178               isBarrel ? paramsDev->timeFitParamsEB.data() : paramsDev->timeFitParamsEE.data();
0179           auto const timeFitParameters_size =
0180               isBarrel ? paramsDev->timeFitParamsEB.size() : paramsDev->timeFitParamsEE.size();
0181           auto const timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE;
0182           auto const timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE;
0183 
0184           // map tx -> (sample_i, sample_j)
0185           int sample_i = 0;
0186           int sample_j = 0;
0187           if (ltx <= 8) {
0188             sample_i = 0;
0189             sample_j = 1 + ltx;
0190           } else if (ltx <= 16) {
0191             sample_i = 1;
0192             sample_j = 2 + ltx - 9;
0193           } else if (ltx <= 23) {
0194             sample_i = 2;
0195             sample_j = 3 + ltx - 17;
0196           } else if (ltx <= 29) {
0197             sample_i = 3;
0198             sample_j = 4 + ltx - 24;
0199           } else if (ltx <= 34) {
0200             sample_i = 4;
0201             sample_j = 5 + ltx - 30;
0202           } else if (ltx <= 38) {
0203             sample_i = 5;
0204             sample_j = 6 + ltx - 35;
0205           } else if (ltx <= 41) {
0206             sample_i = 6;
0207             sample_j = 7 + ltx - 39;
0208           } else if (ltx <= 43) {
0209             sample_i = 7;
0210             sample_j = 8 + ltx - 42;
0211           } else if (ltx <= 44) {
0212             sample_i = 8;
0213             sample_j = 9;
0214           } else {
0215             // FIXME this needs a more portable solution, that wraps abort() / __trap() / throw depending on the back-end
0216             ALPAKA_ASSERT_ACC(false);
0217           }
0218 
0219           auto const tx_i = ch_start + sample_i;
0220           auto const tx_j = ch_start + sample_j;
0221 
0222           //
0223           // note, given the way we partition the block, with 45 threads per channel
0224           // we will end up with inactive threads which need to be dragged along
0225           // through the synching point
0226           //
0227           bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] ||
0228                                              sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1;
0229 
0230           //
0231           // see cpu implementation for explanation
0232           //
0233           ScalarType chi2 = std::numeric_limits<ScalarType>::max();
0234           ScalarType tmax = 0;
0235           ScalarType tmaxerr = 0;
0236           shrTimeMax[idx.local] = 0;
0237           shrTimeWgt[idx.local] = 0;
0238 
0239           bool internalCondForSkipping1 = true;
0240           bool internalCondForSkipping2 = true;
0241           if (!condForUselessSamples) {
0242             auto const rtmp = sample_values[tx_i] / sample_values[tx_j];
0243             auto const invampl_i = 1. / sample_values[tx_i];
0244             auto const relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i;
0245             auto const invampl_j = 1. / sample_values[tx_j];
0246             auto const relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j;
0247             auto const err1 = rtmp * rtmp * (relErr2_i + relErr2_j);
0248             auto err2 =
0249                 sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j);
0250             // TODO non-divergent branch for a block if each block has 1 channel
0251             // otherwise non-divergent for groups of 45 threads
0252             // at this point, pedestal_nums[ch] can be either 0, 1 or 2
0253             if (pedestal_nums[ch] == 2)
0254               err2 *= err2 * 0.5;
0255             auto const err3 = (0.289 * 0.289) * (invampl_j * invampl_j);
0256             auto const total_error = std::sqrt(err1 + err2 + err3);
0257 
0258             auto const alpha = amplitudeFitParameters[0];
0259             auto const beta = amplitudeFitParameters[1];
0260             auto const alphabeta = alpha * beta;
0261             auto const invalphabeta = 1. / alphabeta;
0262 
0263             // variables instead of a struct
0264             auto const ratio_index = sample_i;
0265             auto const ratio_step = sample_j - sample_i;
0266             auto const ratio_value = rtmp;
0267             auto const ratio_error = total_error;
0268 
0269             auto const rlim_i_j = fast_expf(static_cast<ScalarType>(sample_j - sample_i) / beta) - 0.001;
0270             internalCondForSkipping1 = !(total_error < 1. && rtmp > 0.001 && rtmp < rlim_i_j);
0271             if (!internalCondForSkipping1) {
0272               //
0273               // precompute.
0274               // in cpu version this was done conditionally
0275               // however easier to do it here (precompute) and then just filter out
0276               // if not needed
0277               //
0278               auto const l_timeFitLimits_first = timeFitLimits_first;
0279               auto const l_timeFitLimits_second = timeFitLimits_second;
0280               if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) {
0281                 auto const time_max_i = static_cast<ScalarType>(ratio_index);
0282                 auto u = timeFitParameters[timeFitParameters_size - 1];
0283                 CMS_UNROLL_LOOP
0284                 for (int k = timeFitParameters_size - 2; k >= 0; --k)
0285                   u = u * ratio_value + timeFitParameters[k];
0286 
0287                 auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]);
0288                 for (int k = timeFitParameters_size - 2; k >= 1; --k)
0289                   du = du * ratio_value + k * timeFitParameters[k];
0290 
0291                 auto const error2 = ratio_error * ratio_error * du * du;
0292                 auto const time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast<ScalarType>(0);
0293                 auto const time_wgt = error2 > 0 ? 1. / error2 : static_cast<ScalarType>(0);
0294 
0295                 // store into shared mem
0296                 // note, this name is essentially identical to the one used
0297                 // below.
0298                 shrTimeMax[idx.local] = error2 > 0 ? time_max : 0;
0299                 shrTimeWgt[idx.local] = error2 > 0 ? time_wgt : 0;
0300               } else {
0301                 shrTimeMax[idx.local] = 0;
0302                 shrTimeWgt[idx.local] = 0;
0303               }
0304 
0305               // continue with ratios
0306               auto const stepOverBeta = static_cast<ScalarType>(ratio_step) / beta;
0307               auto const offset = static_cast<ScalarType>(ratio_index) + alphabeta;
0308               auto const rmin = std::max(ratio_value - ratio_error, 0.001);
0309               auto const rmax =
0310                   std::min(ratio_value + ratio_error, fast_expf(static_cast<ScalarType>(ratio_step) / beta) - 0.001);
0311               auto const time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.);
0312               auto const time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.);
0313 
0314               // set these guys
0315               tmax = 0.5 * (time1 + time2);
0316               tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
0317 #ifdef DEBUG_TC_MAKERATIO
0318               if (ch == 1 || ch == 0)
0319                 printf(
0320                     "ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = "
0321                     "%f\n",
0322                     ch,
0323                     ltx,
0324                     tmax,
0325                     tmaxerr,
0326                     time1,
0327                     time2,
0328                     offset,
0329                     rmin,
0330                     rmax);
0331 #endif
0332 
0333               ScalarType sumAf = 0;
0334               ScalarType sumff = 0;
0335               const int itmin = std::max(-1, static_cast<int>(std::floor(tmax - alphabeta)));
0336               auto loffset = (static_cast<ScalarType>(itmin) - tmax) * invalphabeta;
0337               // TODO: data dependence
0338               for (int it = itmin + 1; it < nsamples; ++it) {
0339                 loffset += invalphabeta;
0340                 if (useless_sample_values[ch_start + it])
0341                   continue;
0342                 auto const inverr2 = 1. / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]);
0343                 auto const term1 = 1. + loffset;
0344                 auto const f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0;
0345                 sumAf += sample_values[ch_start + it] * (f * inverr2);
0346                 sumff += f * (f * inverr2);
0347               }
0348 
0349               auto const sumAA = sumAAsNullHypot[ch];
0350               auto const sum0 = sum0sNullHypot[ch];
0351               chi2 = sumAA;
0352               // TODO: sum0 can not be 0 below, need to introduce the check upfront
0353               if (sumff > 0) {
0354                 chi2 = sumAA - sumAf * (sumAf / sumff);
0355               }
0356               chi2 /= sum0;
0357 
0358 #ifdef DEBUG_TC_MAKERATIO
0359               if (ch == 1 || ch == 0)
0360                 printf(
0361                     "ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = "
0362                     "%f\n",
0363                     ch,
0364                     ltx,
0365                     sumAf,
0366                     sumff,
0367                     sumAA,
0368                     static_cast<int>(sum0),
0369                     tmax,
0370                     tmaxerr,
0371                     chi2);
0372 #endif
0373 
0374               if (chi2 > 0 && tmax > 0 && tmaxerr > 0)
0375                 internalCondForSkipping2 = false;
0376               else
0377                 chi2 = std::numeric_limits<ScalarType>::max();
0378             }
0379           }
0380 
0381           // store into smem
0382           shr_chi2s[idx.local] = chi2;
0383           shr_chi2[idx.local] = chi2;
0384           shr_tmax[idx.local] = tmax;
0385           shr_tmaxerr[idx.local] = tmaxerr;
0386           shr_condForUselessSamples[idx.local] = condForUselessSamples;
0387           shr_internalCondForSkipping1[idx.local] = internalCondForSkipping1;
0388           shr_internalCondForSkipping2[idx.local] = internalCondForSkipping2;
0389         }
0390 
0391         alpaka::syncBlockThreads(acc);
0392 
0393         // find min chi2 - quite crude for now
0394         // TODO validate/check
0395         auto iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
0396         bool oddElements = nthreads_per_channel % 2;
0397         CMS_UNROLL_LOOP
0398         while (iter >= 1) {
0399           for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0400             auto const ltx = idx.global % nthreads_per_channel;
0401 
0402             if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) {
0403               // for odd ns, the last guy will just store itself
0404               // exception is for ltx == 0 and iter==1
0405               shr_chi2s[idx.local] = std::min(shr_chi2s[idx.local], shr_chi2s[idx.local + iter]);
0406             }
0407           }
0408           alpaka::syncBlockThreads(acc);
0409 
0410           oddElements = iter % 2;
0411           iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
0412         }
0413 
0414         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0415           auto const ltx = idx.global % nthreads_per_channel;
0416 
0417           // get precomputedflags for this element from shared memory
0418           auto const condForUselessSamples = shr_condForUselessSamples[idx.local];
0419           auto const internalCondForSkipping1 = shr_internalCondForSkipping1[idx.local];
0420           auto const internalCondForSkipping2 = shr_internalCondForSkipping2[idx.local];
0421           // filter out inactive or useless samples threads
0422           if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) {
0423             // min chi2, now compute weighted average of tmax measurements
0424             // see cpu version for more explanation
0425             auto const chi2 = shr_chi2[idx.local];
0426             auto const chi2min = shr_chi2s[idx.local - ltx];
0427             auto const chi2Limit = chi2min + 1.;
0428             auto const tmaxerr = shr_tmaxerr[idx.local];
0429             auto const inverseSigmaSquared = chi2 < chi2Limit ? 1. / (tmaxerr * tmaxerr) : 0.;
0430 
0431 #ifdef DEBUG_TC_MAKERATIO
0432             if (ch == 1 || ch == 0) {
0433               auto const ch = idx.global / nthreads_per_channel;
0434               printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
0435                      ch,
0436                      ltx,
0437                      chi2min,
0438                      chi2Limit,
0439                      inverseSigmaSquared);
0440             }
0441 #endif
0442 
0443             // store into shared mem and run reduction
0444             // TODO: check if cooperative groups would be better
0445             // TODO: check if shuffling intrinsics are better
0446             auto const tmax = shr_tmax[idx.local];
0447             shr_time_wgt[idx.local] = inverseSigmaSquared;
0448             shr_time_max[idx.local] = tmax * inverseSigmaSquared;
0449           } else {
0450             shr_time_wgt[idx.local] = 0;
0451             shr_time_max[idx.local] = 0;
0452           }
0453         }
0454 
0455         alpaka::syncBlockThreads(acc);
0456 
0457         // reduce to compute time_max and time_wgt
0458         iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
0459         oddElements = nthreads_per_channel % 2;
0460         CMS_UNROLL_LOOP
0461         while (iter >= 1) {
0462           for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0463             auto const ltx = idx.global % nthreads_per_channel;
0464 
0465             if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) {
0466               shr_time_wgt[idx.local] += shr_time_wgt[idx.local + iter];
0467               shr_time_max[idx.local] += shr_time_max[idx.local + iter];
0468               shrTimeMax[idx.local] += shrTimeMax[idx.local + iter];
0469               shrTimeWgt[idx.local] += shrTimeWgt[idx.local + iter];
0470             }
0471           }
0472 
0473           alpaka::syncBlockThreads(acc);
0474           oddElements = iter % 2;
0475           iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
0476         }
0477 
0478         for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
0479           auto const ltx = idx.global % nthreads_per_channel;
0480 
0481           // load from shared memory the 0th guy (will contain accumulated values)
0482           // compute
0483           // store into global mem
0484           if (ltx == 0) {
0485             auto const ch = idx.global / nthreads_per_channel;
0486             auto const tmp_time_max = shr_time_max[idx.local];
0487             auto const tmp_time_wgt = shr_time_wgt[idx.local];
0488 
0489             // we are done if there number of time ratios is 0
0490             if (tmp_time_wgt == 0 && tmp_time_max == 0) {
0491               g_state[ch] = TimeComputationState::Finished;
0492               continue;
0493             }
0494 
0495             // no div by 0
0496             auto const tMaxAlphaBeta = tmp_time_max / tmp_time_wgt;
0497             auto const tMaxErrorAlphaBeta = 1. / std::sqrt(tmp_time_wgt);
0498 
0499             tMaxAlphaBetas[ch] = tMaxAlphaBeta;
0500             tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta;
0501             g_accTimeMax[ch] = shrTimeMax[idx.local];
0502             g_accTimeWgt[ch] = shrTimeWgt[idx.local];
0503             g_state[ch] = TimeComputationState::NotFinished;
0504 
0505 #ifdef DEBUG_TC_MAKERATIO
0506             printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt);
0507             printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
0508                    ch,
0509                    tMaxAlphaBeta,
0510                    tMaxErrorAlphaBeta,
0511                    shrTimeMax[idx.local],
0512                    shrTimeWgt[idx.local]);
0513 #endif
0514           }
0515         }
0516       }
0517     }
0518   };
0519 
0520   class Kernel_time_compute_findamplchi2_and_finish {
0521     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0522 
0523   public:
0524     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0525                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0526                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0527                                   ScalarType* const sample_values,
0528                                   ScalarType* const sample_value_errors,
0529                                   bool* const useless_samples,
0530                                   ScalarType* const g_tMaxAlphaBeta,
0531                                   ScalarType* const g_tMaxErrorAlphaBeta,
0532                                   ScalarType* const g_accTimeMax,
0533                                   ScalarType* const g_accTimeWgt,
0534                                   ScalarType* const sumAAsNullHypot,
0535                                   ScalarType* const sum0sNullHypot,
0536                                   ScalarType* const chi2sNullHypot,
0537                                   TimeComputationState* g_state,
0538                                   ScalarType* g_ampMaxAlphaBeta,
0539                                   ScalarType* g_ampMaxError,
0540                                   ScalarType* g_timeMax,
0541                                   ScalarType* g_timeError,
0542                                   EcalMultifitParameters const* paramsDev) const {
0543       /// launch ctx parameters are
0544       /// 10 threads per channel, N channels per block, Y blocks
0545       /// TODO: do we need to keep the state around or can be removed?!
0546       //#define DEBUG_FINDAMPLCHI2_AND_FINISH
0547 
0548       // constants
0549       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0550       auto const nchannels = digisDevEB.size() + digisDevEE.size();
0551       auto const offsetForInputs = digisDevEB.size();
0552 
0553       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0554 
0555       // configure shared mem
0556       // per block, we need #threads per block * 2 * sizeof(ScalarType)
0557       // we run with N channels per block
0558       auto* shr_sumAf = alpaka::getDynSharedMem<ScalarType>(acc);
0559       auto* shr_sumff = shr_sumAf + elemsPerBlock;
0560 
0561       for (auto gtxforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
0562         // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
0563         auto gtx = nchannels * nsamples - 1 - gtxforward;
0564         auto const ch = gtx / nsamples;
0565         auto const elemIdx = gtx % elemsPerBlock;
0566         auto const sample = elemIdx % nsamples;
0567 
0568         auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
0569         auto const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0570 
0571         auto state = g_state[ch];
0572         auto const did = DetId{dids[inputCh]};
0573         auto* const amplitudeFitParameters = did.subdetId() == EcalBarrel ? paramsDev->amplitudeFitParamsEB.data()
0574                                                                           : paramsDev->amplitudeFitParamsEE.data();
0575 
0576         // TODO is that better than storing into global and launching another kernel
0577         // for the first 10 threads
0578         if (state == TimeComputationState::NotFinished) {
0579           auto const alpha = amplitudeFitParameters[0];
0580           auto const beta = amplitudeFitParameters[1];
0581           auto const alphabeta = alpha * beta;
0582           auto const invalphabeta = 1. / alphabeta;
0583           auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
0584           auto const sample_value = sample_values[gtx];
0585           auto const sample_value_error = sample_value_errors[gtx];
0586           auto const inverr2 =
0587               useless_samples[gtx] ? static_cast<ScalarType>(0) : 1. / (sample_value_error * sample_value_error);
0588           auto const offset = (static_cast<ScalarType>(sample) - tMaxAlphaBeta) * invalphabeta;
0589           auto const term1 = 1. + offset;
0590           auto const f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast<ScalarType>(0.);
0591           auto const sumAf = sample_value * (f * inverr2);
0592           auto const sumff = f * (f * inverr2);
0593 
0594           // store into shared mem
0595           shr_sumAf[elemIdx] = sumAf;
0596           shr_sumff[elemIdx] = sumff;
0597         } else {
0598           shr_sumAf[elemIdx] = 0;
0599           shr_sumff[elemIdx] = 0;
0600         }
0601 
0602         alpaka::syncBlockThreads(acc);
0603 
0604         // reduce
0605         // unroll completely here (but hardcoded)
0606         if (sample < 5) {
0607           shr_sumAf[elemIdx] += shr_sumAf[elemIdx + 5];
0608           shr_sumff[elemIdx] += shr_sumff[elemIdx + 5];
0609         }
0610 
0611         alpaka::syncBlockThreads(acc);
0612 
0613         if (sample < 2) {
0614           // will need to subtract for ltx = 3, we double count here
0615           shr_sumAf[elemIdx] += shr_sumAf[elemIdx + 2] + shr_sumAf[elemIdx + 3];
0616           shr_sumff[elemIdx] += shr_sumff[elemIdx + 2] + shr_sumff[elemIdx + 3];
0617         }
0618 
0619         alpaka::syncBlockThreads(acc);
0620 
0621         if (sample == 0) {
0622           // exit if the state is done
0623           // note, we do not exit before all __synchtreads are finished
0624           if (state == TimeComputationState::Finished) {
0625             g_timeMax[ch] = 5;
0626             g_timeError[ch] = -999;
0627             continue;
0628           }
0629 
0630           // subtract to avoid double counting
0631           auto const sumff = shr_sumff[elemIdx] + shr_sumff[elemIdx + 1] - shr_sumff[elemIdx + 3];
0632           auto const sumAf = shr_sumAf[elemIdx] + shr_sumAf[elemIdx + 1] - shr_sumAf[elemIdx + 3];
0633 
0634           auto const ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0;
0635           auto const sumAA = sumAAsNullHypot[ch];
0636           auto const sum0 = sum0sNullHypot[ch];
0637           auto const nullChi2 = chi2sNullHypot[ch];
0638           if (sumff > 0) {
0639             auto const chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
0640             if (chi2AlphaBeta > nullChi2) {
0641               // null hypothesis is better
0642               state = TimeComputationState::Finished;
0643 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0644               printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
0645                      ch,
0646                      chi2AlphaBeta,
0647                      nullChi2,
0648                      sumAA,
0649                      sumAf,
0650                      sumff,
0651                      sum0);
0652 #endif
0653             }
0654 
0655             // store to global
0656             g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
0657           } else {
0658 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0659             printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf);
0660 #endif
0661             state = TimeComputationState::Finished;
0662           }
0663 
0664           // store the state to global and finish calcs
0665           g_state[ch] = state;
0666           if (state == TimeComputationState::Finished) {
0667             // store default values into global
0668             g_timeMax[ch] = 5;
0669             g_timeError[ch] = -999;
0670 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0671             printf("ch = %d finished state\n", ch);
0672 #endif
0673             continue;
0674           }
0675 
0676           auto const ampMaxError = g_ampMaxError[ch];
0677           auto const test_ratio = ampMaxAlphaBeta / ampMaxError;
0678           auto const accTimeMax = g_accTimeMax[ch];
0679           auto const accTimeWgt = g_accTimeWgt[ch];
0680           auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
0681           auto const tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch];
0682           // branch to separate large vs small pulses
0683           // see cpu version for more info
0684           if (test_ratio > 5. && accTimeWgt > 0) {
0685             auto const tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast<ScalarType>(0);
0686             auto const tMaxErrorRatio = accTimeWgt > 0 ? 1. / std::sqrt(accTimeWgt) : static_cast<ScalarType>(0);
0687 
0688             if (test_ratio > 10.) {
0689               g_timeMax[ch] = tMaxRatio;
0690               g_timeError[ch] = tMaxErrorRatio;
0691 
0692 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0693               printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio);
0694 #endif
0695             } else {
0696               auto const timeMax = (tMaxAlphaBeta * (10. - ampMaxAlphaBeta / ampMaxError) +
0697                                     tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.)) /
0698                                    5.;
0699               auto const timeError = (tMaxErrorAlphaBeta * (10. - ampMaxAlphaBeta / ampMaxError) +
0700                                       tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.)) /
0701                                      5.;
0702               state = TimeComputationState::Finished;
0703               g_state[ch] = state;
0704               g_timeMax[ch] = timeMax;
0705               g_timeError[ch] = timeError;
0706 
0707 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0708               printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError);
0709 #endif
0710             }
0711           } else {
0712             state = TimeComputationState::Finished;
0713             g_state[ch] = state;
0714             g_timeMax[ch] = tMaxAlphaBeta;
0715             g_timeError[ch] = tMaxErrorAlphaBeta;
0716 
0717 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
0718             printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
0719 #endif
0720           }
0721         }
0722       }
0723     }
0724   };
0725 
0726   class Kernel_time_compute_fixMGPAslew {
0727     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0728 
0729   public:
0730     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0731                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0732                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0733                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0734                                   ScalarType* sample_values,
0735                                   ScalarType* sample_value_errors,
0736                                   bool* useless_sample_values) const {
0737       // constants
0738       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0739 
0740       auto const nchannelsEB = digisDevEB.size();
0741       auto const offsetForInputs = nchannelsEB;
0742 
0743       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0744 
0745       for (auto gtx : cms::alpakatools::uniform_elements(acc, nchannelsEB * nsamples)) {
0746         auto const elemIdx = gtx % elemsPerBlock;
0747         auto const sample = elemIdx % nsamples;
0748         auto const ch = gtx / nsamples;
0749 
0750         // remove thread for sample 0, oversubscribing is easier than ....
0751         if (sample == 0)
0752           continue;
0753 
0754         if (!use_sample(conditionsDev.sampleMask_EB(), sample))
0755           continue;
0756 
0757         int const inputGtx = ch >= offsetForInputs ? gtx - offsetForInputs * nsamples : gtx;
0758         auto const* digis = ch >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();
0759 
0760         auto const gainIdPrev = ecalMGPA::gainId(digis[inputGtx - 1]);
0761         auto const gainIdNext = ecalMGPA::gainId(digis[inputGtx]);
0762         if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) {
0763           sample_values[gtx - 1] = 0;
0764           sample_value_errors[gtx - 1] = 1e+9;
0765           useless_sample_values[gtx - 1] = true;
0766         }
0767       }
0768     }
0769   };
0770 
0771   //#define ECAL_RECO_ALPAKA_TC_INIT_DEBUG
0772   class Kernel_time_computation_init {
0773     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0774 
0775   public:
0776     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0777                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0778                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0779                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0780                                   ScalarType* sample_values,
0781                                   ScalarType* sample_value_errors,
0782                                   ScalarType* ampMaxError,
0783                                   bool* useless_sample_values,
0784                                   char* pedestal_nums) const {
0785       // constants
0786       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0787 
0788       // indices
0789       auto const nchannelsEB = digisDevEB.size();
0790       auto const nchannels = nchannelsEB + digisDevEE.size();
0791       auto const offsetForInputs = nchannelsEB;
0792       auto const offsetForHashes = conditionsDev.offsetEE();
0793 
0794       auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
0795 
0796       // configure shared mem
0797       auto* shrSampleValues = alpaka::getDynSharedMem<ScalarType>(acc);
0798       auto* shrSampleValueErrors = shrSampleValues + elemsPerBlock;
0799 
0800       for (auto txforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
0801         // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
0802         auto tx = nchannels * nsamples - 1 - txforward;
0803         auto const ch = tx / nsamples;
0804         auto const elemIdx = tx % elemsPerBlock;
0805 
0806         int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx;
0807         int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
0808         auto const* digis = ch >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();
0809         auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
0810 
0811         // indices/inits
0812         auto const sample = tx % nsamples;
0813         auto const input_ch_start = inputCh * nsamples;
0814         ScalarType pedestal = 0.;
0815         int num = 0;
0816 
0817         // 0 and 1 sample values
0818         auto const adc0 = ecalMGPA::adc(digis[input_ch_start]);
0819         auto const gainId0 = ecalMGPA::gainId(digis[input_ch_start]);
0820         auto const adc1 = ecalMGPA::adc(digis[input_ch_start + 1]);
0821         auto const gainId1 = ecalMGPA::gainId(digis[input_ch_start + 1]);
0822         auto const did = DetId{dids[inputCh]};
0823         auto const isBarrel = did.subdetId() == EcalBarrel;
0824         auto const sample_mask = isBarrel ? conditionsDev.sampleMask_EB() : conditionsDev.sampleMask_EE();
0825         auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
0826                                        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
0827 
0828         // set pedestal
0829         // TODO this branch is non-divergent for a group of 10 threads
0830         if (gainId0 == 1 && use_sample(sample_mask, 0)) {
0831           pedestal = static_cast<ScalarType>(adc0);
0832           num = 1;
0833 
0834           auto const diff = adc1 - adc0;
0835           if (gainId1 == 1 && use_sample(sample_mask, 1) &&
0836               std::abs(diff) < 3 * conditionsDev.pedestals_rms_x12()[hashedId]) {
0837             pedestal = (pedestal + static_cast<ScalarType>(adc1)) / 2.;
0838             num = 2;
0839           }
0840         } else {
0841           pedestal = conditionsDev.pedestals_mean_x12()[ch];
0842         }
0843 
0844         // ped subtracted and gain-renormalized samples.
0845         auto const gainId = ecalMGPA::gainId(digis[inputTx]);
0846         auto const adc = ecalMGPA::adc(digis[inputTx]);
0847 
0848         bool bad = false;
0849         ScalarType sample_value, sample_value_error;
0850         // TODO divergent branch
0851         // TODO: piece below is general both for amplitudes and timing
0852         // potentially there is a way to reduce the amount of code...
0853         if (!use_sample(sample_mask, sample)) {
0854           bad = true;
0855           sample_value = 0;
0856           sample_value_error = 0;
0857         } else if (gainId == 1) {
0858           sample_value = static_cast<ScalarType>(adc) - pedestal;
0859           sample_value_error = conditionsDev.pedestals_rms_x12()[hashedId];
0860         } else if (gainId == 2) {
0861           auto const mean_x6 = conditionsDev.pedestals_mean_x6()[hashedId];
0862           auto const rms_x6 = conditionsDev.pedestals_rms_x6()[hashedId];
0863           auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
0864           sample_value = (static_cast<ScalarType>(adc) - mean_x6) * gain12Over6;
0865           sample_value_error = rms_x6 * gain12Over6;
0866         } else if (gainId == 3) {
0867           auto const mean_x1 = conditionsDev.pedestals_mean_x1()[hashedId];
0868           auto const rms_x1 = conditionsDev.pedestals_rms_x1()[hashedId];
0869           auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
0870           auto const gain6Over1 = conditionsDev.gain6Over1()[hashedId];
0871           sample_value = (static_cast<ScalarType>(adc) - mean_x1) * gain6Over1 * gain12Over6;
0872           sample_value_error = rms_x1 * gain6Over1 * gain12Over6;
0873         } else {
0874           sample_value = 0;
0875           sample_value_error = 0;
0876           bad = true;
0877         }
0878 
0879         // TODO: make sure we save things correctly when sample is useless
0880         auto const useless_sample = (sample_value_error <= 0) | bad;
0881         useless_sample_values[tx] = useless_sample;
0882         sample_values[tx] = sample_value;
0883         sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error;
0884 
0885         // DEBUG
0886 #ifdef ECAL_RECO_ALPAKA_TC_INIT_DEBUG
0887         if (ch == 0) {
0888           printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
0889                  sample,
0890                  sample_value,
0891                  sample_value_error,
0892                  useless_sample ? '1' : '0');
0893         }
0894 #endif
0895 
0896         // store into the shared mem
0897         shrSampleValues[elemIdx] = sample_value_error > 0 ? sample_value : std::numeric_limits<ScalarType>::min();
0898         shrSampleValueErrors[elemIdx] = sample_value_error;
0899         alpaka::syncBlockThreads(acc);
0900 
0901         // perform the reduction with min
0902         if (sample < 5) {
0903           // note, if equal -> we keep the value with lower sample as for cpu
0904           shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 5]
0905                                               ? shrSampleValueErrors[elemIdx + 5]
0906                                               : shrSampleValueErrors[elemIdx];
0907           shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 5]);
0908         }
0909         alpaka::syncBlockThreads(acc);
0910 
0911         // a bit of an overkill, but easier than to compare across 3 values
0912         if (sample < 3) {
0913           shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 3]
0914                                               ? shrSampleValueErrors[elemIdx + 3]
0915                                               : shrSampleValueErrors[elemIdx];
0916           shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 3]);
0917         }
0918         alpaka::syncBlockThreads(acc);
0919 
0920         if (sample < 2) {
0921           shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 2]
0922                                               ? shrSampleValueErrors[elemIdx + 2]
0923                                               : shrSampleValueErrors[elemIdx];
0924           shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 2]);
0925         }
0926         alpaka::syncBlockThreads(acc);
0927 
0928         if (sample == 0) {
0929           // we only need the max error
0930           auto const maxSampleValueError = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 1]
0931                                                ? shrSampleValueErrors[elemIdx + 1]
0932                                                : shrSampleValueErrors[elemIdx];
0933 
0934           // # pedestal samples used
0935           pedestal_nums[ch] = num;
0936           // this is used downstream
0937           ampMaxError[ch] = maxSampleValueError;
0938 
0939           // DEBUG
0940 #ifdef ECAL_RECO_ALPAKA_TC_INIT_DEBUG
0941           if (ch == 0) {
0942             printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError);
0943           }
0944 #endif
0945         }
0946       }
0947     }
0948   };
0949 
0950   ///
0951   /// launch context parameters: 1 thread per channel
0952   ///
0953   //#define DEBUG_TIME_CORRECTION
0954   class Kernel_time_correction_and_finalize {
0955     using ScalarType = ::ecal::multifit::SampleVector::Scalar;
0956 
0957   public:
0958     ALPAKA_FN_ACC void operator()(Acc1D const& acc,
0959                                   EcalDigiDeviceCollection::ConstView digisDevEB,
0960                                   EcalDigiDeviceCollection::ConstView digisDevEE,
0961                                   EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEB,
0962                                   EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEE,
0963                                   EcalMultifitConditionsDevice::ConstView conditionsDev,
0964                                   ScalarType* const g_timeMax,
0965                                   ScalarType* const g_timeError,
0966                                   ConfigurationParameters::type const timeConstantTermEB,
0967                                   ConfigurationParameters::type const timeConstantTermEE,
0968                                   ConfigurationParameters::type const timeNconstEB,
0969                                   ConfigurationParameters::type const timeNconstEE,
0970                                   ConfigurationParameters::type const amplitudeThresholdEB,
0971                                   ConfigurationParameters::type const amplitudeThresholdEE,
0972                                   ConfigurationParameters::type const outOfTimeThreshG12pEB,
0973                                   ConfigurationParameters::type const outOfTimeThreshG12pEE,
0974                                   ConfigurationParameters::type const outOfTimeThreshG12mEB,
0975                                   ConfigurationParameters::type const outOfTimeThreshG12mEE,
0976                                   ConfigurationParameters::type const outOfTimeThreshG61pEB,
0977                                   ConfigurationParameters::type const outOfTimeThreshG61pEE,
0978                                   ConfigurationParameters::type const outOfTimeThreshG61mEB,
0979                                   ConfigurationParameters::type const outOfTimeThreshG61mEE) const {
0980       // constants
0981       constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
0982       auto const nchannelsEB = digisDevEB.size();
0983       auto const nchannels = nchannelsEB + digisDevEE.size();
0984       auto const offsetForInputs = nchannelsEB;
0985       auto const offsetForHashes = conditionsDev.offsetEE();
0986 
0987       for (auto gtx : cms::alpakatools::uniform_elements(acc, nchannels)) {
0988         const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx;
0989         auto const* dids = gtx >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
0990         auto const* digis = gtx >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();
0991 
0992         auto* g_amplitude = gtx >= nchannelsEB ? uncalibRecHitsEE.amplitude() : uncalibRecHitsEB.amplitude();
0993         auto* g_jitter = gtx >= nchannelsEB ? uncalibRecHitsEE.jitter() : uncalibRecHitsEB.jitter();
0994         auto* g_jitterError = gtx >= nchannelsEB ? uncalibRecHitsEE.jitterError() : uncalibRecHitsEB.jitterError();
0995         auto* flags = gtx >= nchannelsEB ? uncalibRecHitsEE.flags() : uncalibRecHitsEB.flags();
0996 
0997         auto const did = DetId{dids[inputGtx]};
0998         auto const isBarrel = did.subdetId() == EcalBarrel;
0999         auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
1000                                        : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
1001         // need to access the underlying data directly here because the std::arrays have different size for EB and EE, which is not compatible with the ? operator
1002         auto* const amplitudeBins = isBarrel ? conditionsDev.timeBiasCorrections_amplitude_EB().data()
1003                                              : conditionsDev.timeBiasCorrections_amplitude_EE().data();
1004         auto* const shiftBins = isBarrel ? conditionsDev.timeBiasCorrections_shift_EB().data()
1005                                          : conditionsDev.timeBiasCorrections_shift_EE().data();
1006         auto const amplitudeBinsSize =
1007             isBarrel ? conditionsDev.timeBiasCorrectionSizeEB() : conditionsDev.timeBiasCorrectionSizeEE();
1008         auto const timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE;
1009         auto const timeNconst = isBarrel ? timeNconstEB : timeNconstEE;
1010         auto const offsetTimeValue = isBarrel ? conditionsDev.timeOffset_EB() : conditionsDev.timeOffset_EE();
1011         auto const amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE;
1012         auto const outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE;
1013         auto const outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE;
1014         auto const outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE;
1015         auto const outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE;
1016 
1017         // load some
1018         auto const amplitude = g_amplitude[inputGtx];
1019         auto const rms_x12 = conditionsDev.pedestals_rms_x12()[hashedId];
1020         auto const timeCalibConst = conditionsDev.timeCalibConstants()[hashedId];
1021 
1022         int myBin = -1;
1023         for (size_t bin = 0; bin < amplitudeBinsSize; ++bin) {
1024           if (amplitude > amplitudeBins[bin])
1025             myBin = bin;
1026           else
1027             break;
1028         }
1029 
1030         ScalarType correction = 0;
1031         if (myBin == -1) {
1032           correction = shiftBins[0];
1033         } else if (myBin == static_cast<int>(amplitudeBinsSize) - 1) {
1034           correction = shiftBins[myBin];
1035         } else {
1036           correction = shiftBins[myBin + 1] - shiftBins[myBin];
1037           correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]);
1038           correction += shiftBins[myBin];
1039         }
1040 
1041         // correction * 1./25.
1042         correction = correction * 0.04;
1043         auto const timeMax = g_timeMax[gtx];
1044         auto const timeError = g_timeError[gtx];
1045         auto const jitter = timeMax - 5 + correction;
1046         auto const jitterError =
1047             std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04);  // 0.04 = 1./25.
1048 
1049 #ifdef DEBUG_TIME_CORRECTION
1050         printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
1051                gtx,
1052                timeMax,
1053                timeError,
1054                jitter,
1055                correction);
1056 #endif
1057 
1058         // store back to  global
1059         g_jitter[inputGtx] = jitter;
1060         g_jitterError[inputGtx] = jitterError;
1061 
1062         // set the flag
1063         // TODO: replace with something more efficient (if required),
1064         // for now just to make it work
1065         if (amplitude > amplitudeThreshold * rms_x12) {
1066           auto threshP = outOfTimeThreshG12p;
1067           auto threshM = outOfTimeThreshG12m;
1068           if (amplitude > 3000.) {
1069             for (int isample = 0; isample < nsamples; isample++) {
1070               auto const gainid = ecalMGPA::gainId(digis[nsamples * inputGtx + isample]);
1071               if (gainid != 1) {
1072                 threshP = outOfTimeThreshG61p;
1073                 threshM = outOfTimeThreshG61m;
1074                 break;
1075               }
1076             }
1077           }
1078 
1079           auto const correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue;
1080           auto const nterm = timeNconst * rms_x12 / amplitude;
1081           auto const sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm);
1082           if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM)
1083             flags[inputGtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
1084         }
1085       }
1086     }
1087   };
1088 
1089 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit
1090 
1091 namespace alpaka::trait {
1092   using namespace ALPAKA_ACCELERATOR_NAMESPACE;
1093   using namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit;
1094 
1095   //! The trait for getting the size of the block shared dynamic memory for Kernel_time_compute_nullhypot.
1096   template <>
1097   struct BlockSharedMemDynSizeBytes<Kernel_time_compute_nullhypot, Acc1D> {
1098     //! \return The size of the shared memory allocated for a block.
1099     template <typename TVec, typename... TArgs>
1100     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_nullhypot const&,
1101                                                                  TVec const& threadsPerBlock,
1102                                                                  TVec const& elemsPerThread,
1103                                                                  TArgs const&...) -> std::size_t {
1104       using ScalarType = ::ecal::multifit::SampleVector::Scalar;
1105 
1106       // return the amount of dynamic shared memory needed
1107       std::size_t bytes = threadsPerBlock[0u] * elemsPerThread[0u] * 4 * sizeof(ScalarType);
1108       return bytes;
1109     }
1110   };
1111 
1112   //! The trait for getting the size of the block shared dynamic memory for Kernel_time_compute_makeratio.
1113   template <>
1114   struct BlockSharedMemDynSizeBytes<Kernel_time_compute_makeratio, Acc1D> {
1115     template <typename TVec, typename... TArgs>
1116     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_makeratio const&,
1117                                                                  TVec const& threadsPerBlock,
1118                                                                  TVec const& elemsPerThread,
1119                                                                  TArgs const&...) -> std::size_t {
1120       using ScalarType = ::ecal::multifit::SampleVector::Scalar;
1121 
1122       std::size_t bytes = (8 * sizeof(ScalarType) + 3 * sizeof(bool)) * threadsPerBlock[0u] * elemsPerThread[0u];
1123       return bytes;
1124     }
1125   };
1126 
1127   //! The trait for getting the size of the block shared dynamic memory for Kernel_time_compute_findamplchi2_and_finish.
1128   template <>
1129   struct BlockSharedMemDynSizeBytes<Kernel_time_compute_findamplchi2_and_finish, Acc1D> {
1130     template <typename TVec, typename... TArgs>
1131     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_findamplchi2_and_finish const&,
1132                                                                  TVec const& threadsPerBlock,
1133                                                                  TVec const& elemsPerThread,
1134                                                                  TArgs const&...) -> std::size_t {
1135       using ScalarType = ::ecal::multifit::SampleVector::Scalar;
1136 
1137       std::size_t bytes = 2 * threadsPerBlock[0u] * elemsPerThread[0u] * sizeof(ScalarType);
1138       return bytes;
1139     }
1140   };
1141 
1142   //! The trait for getting the size of the block shared dynamic memory for Kernel_time_computation_init.
1143   template <>
1144   struct BlockSharedMemDynSizeBytes<Kernel_time_computation_init, Acc1D> {
1145     template <typename TVec, typename... TArgs>
1146     ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_computation_init const&,
1147                                                                  TVec const& threadsPerBlock,
1148                                                                  TVec const& elemsPerThread,
1149                                                                  TArgs const&...) -> std::size_t {
1150       using ScalarType = ::ecal::multifit::SampleVector::Scalar;
1151 
1152       std::size_t bytes = 2 * threadsPerBlock[0u] * elemsPerThread[0u] * sizeof(ScalarType);
1153       return bytes;
1154     }
1155   };
1156 
1157 }  // namespace alpaka::trait
1158 
1159 #endif  // RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h